From f6ace2bc15bfde4cc9bd140859fa92618568a006 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 29 May 2024 09:51:05 +0100 Subject: [PATCH] [AArch64] Expand vector ops when NEON and SVE are unavailable. (#90833) Unlike `+noneon` we must assume that vector types are available, i.e. it is valid to pass/return vector arguments to and from functions. However, the compiler must make sure to scalarize any vector operations. --- .../Target/AArch64/AArch64ISelLowering.cpp | 79 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 6 +- ...streaming-mode-fixed-length-and-combine.ll | 226 +- ...treaming-mode-fixed-length-bit-counting.ll | 2167 +++++++- ...sve-streaming-mode-fixed-length-bitcast.ll | 30 +- ...e-streaming-mode-fixed-length-bitselect.ll | 32 +- .../sve-streaming-mode-fixed-length-concat.ll | 119 +- ...e-streaming-mode-fixed-length-ext-loads.ll | 338 +- ...ing-mode-fixed-length-extract-subvector.ll | 50 +- ...ng-mode-fixed-length-extract-vector-elt.ll | 54 +- ...e-streaming-mode-fixed-length-fcopysign.ll | 846 ++- ...ve-streaming-mode-fixed-length-fp-arith.ll | 3177 ++++++++--- ...streaming-mode-fixed-length-fp-compares.ll | 4788 +++++++++-------- ...-streaming-mode-fixed-length-fp-convert.ll | 29 +- ...aming-mode-fixed-length-fp-extend-trunc.ll | 732 ++- .../sve-streaming-mode-fixed-length-fp-fma.ll | 569 +- ...e-streaming-mode-fixed-length-fp-minmax.ll | 2040 ++++--- ...eaming-mode-fixed-length-fp-reduce-fa64.ll | 26 +- ...e-streaming-mode-fixed-length-fp-reduce.ll | 1438 +++-- ...streaming-mode-fixed-length-fp-rounding.ll | 2030 ++++++- ...e-streaming-mode-fixed-length-fp-select.ll | 305 +- ...e-streaming-mode-fixed-length-fp-to-int.ll | 2254 ++++++-- ...-streaming-mode-fixed-length-fp-vselect.ll | 511 +- ...ing-mode-fixed-length-insert-vector-elt.ll | 367 +- ...e-streaming-mode-fixed-length-int-arith.ll | 2132 +++++++- ...treaming-mode-fixed-length-int-compares.ll | 1048 +++- ...sve-streaming-mode-fixed-length-int-div.ll | 2044 +++---- ...streaming-mode-fixed-length-int-extends.ll | 3716 ++++++++++--- ...eaming-mode-fixed-length-int-immediates.ll | 3425 +++++++++++- ...sve-streaming-mode-fixed-length-int-log.ll | 1503 +++++- ...-streaming-mode-fixed-length-int-minmax.ll | 2404 ++++++++- ...ing-mode-fixed-length-int-mla-neon-fa64.ll | 47 +- ...ve-streaming-mode-fixed-length-int-mulh.ll | 1664 +++++- ...-streaming-mode-fixed-length-int-reduce.ll | 1642 +++++- ...sve-streaming-mode-fixed-length-int-rem.ll | 2654 ++++----- ...-streaming-mode-fixed-length-int-select.ll | 581 +- ...-streaming-mode-fixed-length-int-shifts.ll | 1632 +++++- ...e-streaming-mode-fixed-length-int-to-fp.ll | 1895 +++++-- ...streaming-mode-fixed-length-int-vselect.ll | 817 ++- ...-streaming-mode-fixed-length-ld2-alloca.ll | 118 +- ...reaming-mode-fixed-length-limit-duplane.ll | 145 +- .../sve-streaming-mode-fixed-length-loads.ll | 33 +- ...-streaming-mode-fixed-length-log-reduce.ll | 888 ++- ...streaming-mode-fixed-length-masked-load.ll | 3314 +++++++++--- ...treaming-mode-fixed-length-masked-store.ll | 806 ++- ...eaming-mode-fixed-length-optimize-ptrue.ll | 937 +++- ...streaming-mode-fixed-length-permute-rev.ll | 472 +- ...g-mode-fixed-length-permute-zip-uzp-trn.ll | 1261 ++++- .../sve-streaming-mode-fixed-length-ptest.ll | 399 +- .../sve-streaming-mode-fixed-length-rev.ll | 936 +++- ...e-streaming-mode-fixed-length-sdiv-pow2.ll | 768 ++- ...sve-streaming-mode-fixed-length-shuffle.ll | 72 +- ...treaming-mode-fixed-length-splat-vector.ll | 245 +- .../sve-streaming-mode-fixed-length-stores.ll | 60 +- ...e-streaming-mode-fixed-length-subvector.ll | 8 +- ...treaming-mode-fixed-length-trunc-stores.ll | 64 +- .../sve-streaming-mode-fixed-length-trunc.ll | 2789 +++++++++- ...eaming-mode-fixed-length-vector-shuffle.ll | 339 +- .../sve-streaming-mode-test-register-mov.ll | 6 +- 59 files changed, 49850 insertions(+), 13227 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 25ba8d8500306f..814bbe27049820 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -360,24 +360,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (Subtarget->hasNEON()) { addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); - // Someone set us up the NEON. - addDRTypeForNEON(MVT::v2f32); - addDRTypeForNEON(MVT::v8i8); - addDRTypeForNEON(MVT::v4i16); - addDRTypeForNEON(MVT::v2i32); - addDRTypeForNEON(MVT::v1i64); - addDRTypeForNEON(MVT::v1f64); - addDRTypeForNEON(MVT::v4f16); - addDRTypeForNEON(MVT::v4bf16); - - addQRTypeForNEON(MVT::v4f32); - addQRTypeForNEON(MVT::v2f64); - addQRTypeForNEON(MVT::v16i8); - addQRTypeForNEON(MVT::v8i16); - addQRTypeForNEON(MVT::v4i32); - addQRTypeForNEON(MVT::v2i64); - addQRTypeForNEON(MVT::v8f16); - addQRTypeForNEON(MVT::v8bf16); + + addDRType(MVT::v2f32); + addDRType(MVT::v8i8); + addDRType(MVT::v4i16); + addDRType(MVT::v2i32); + addDRType(MVT::v1i64); + addDRType(MVT::v1f64); + addDRType(MVT::v4f16); + addDRType(MVT::v4bf16); + + addQRType(MVT::v4f32); + addQRType(MVT::v2f64); + addQRType(MVT::v16i8); + addQRType(MVT::v8i16); + addQRType(MVT::v4i32); + addQRType(MVT::v2i64); + addQRType(MVT::v8f16); + addQRType(MVT::v8bf16); } if (Subtarget->hasSVEorSME()) { @@ -1125,7 +1125,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - if (Subtarget->hasNEON()) { + if (Subtarget->isNeonAvailable()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: for (auto Op : @@ -1337,6 +1337,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // FADDP custom lowering for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::FADD, VT, Custom); + } else /* !isNeonAvailable */ { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { + for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) + setOperationAction(Op, VT, Expand); + + if (VT.is128BitVector() || VT.is64BitVector()) { + setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::STORE, VT, Legal); + setOperationAction(ISD::BITCAST, VT, + Subtarget->isLittleEndian() ? Legal : Expand); + } + for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { + setTruncStoreAction(VT, InnerVT, Expand); + setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); + setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); + } + } } if (Subtarget->hasSME()) { @@ -2020,14 +2038,16 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::ZERO_EXTEND, VT, Default); } -void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { +void AArch64TargetLowering::addDRType(MVT VT) { addRegisterClass(VT, &AArch64::FPR64RegClass); - addTypeForNEON(VT); + if (Subtarget->isNeonAvailable()) + addTypeForNEON(VT); } -void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { +void AArch64TargetLowering::addQRType(MVT VT) { addRegisterClass(VT, &AArch64::FPR128RegClass); - addTypeForNEON(VT); + if (Subtarget->isNeonAvailable()) + addTypeForNEON(VT); } EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, @@ -9445,7 +9465,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { - if (!Subtarget->hasNEON()) + if (!Subtarget->isNeonAvailable() && + !Subtarget->useSVEForFixedLengthVectors()) return SDValue(); EVT VT = Op.getValueType(); @@ -14141,6 +14162,13 @@ SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi); } +bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles( + EVT VT, unsigned DefinedValues) const { + if (!Subtarget->isNeonAvailable()) + return false; + return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues); +} + bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { // Currently no fixed length shuffles that require SVE are legal. if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) @@ -19838,7 +19866,8 @@ performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // help, for example, to produce ssra from sshr+add. static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); - if (VT != MVT::i64) + if (VT != MVT::i64 || + DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64)) return SDValue(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index a44a3d35d2f9c8..73bc9ad53bb8a3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1017,8 +1017,10 @@ class AArch64TargetLowering : public TargetLowering { void addTypeForNEON(MVT VT); void addTypeForFixedLengthSVE(MVT VT); - void addDRTypeForNEON(MVT VT); - void addQRTypeForNEON(MVT VT); + void addDRType(MVT VT); + void addQRType(MVT VT); + + bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override; unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll index ed3222529a3bb9..4cdb175f55c9cc 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll @@ -18,8 +18,15 @@ define <4 x i8> @vls_sve_and_4xi8(<4 x i8> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_4xi8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xff000000ff0000 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #12] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %c = and <4 x i8> %b, ret <4 x i8> %c @@ -37,8 +44,21 @@ define <8 x i8> @vls_sve_and_8xi8(<8 x i8> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_8xi8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff00 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #14] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %c = and <8 x i8> %b, ret <8 x i8> %c @@ -56,8 +76,33 @@ define <16 x i8> @vls_sve_and_16xi8(<16 x i8> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_16xi8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.2d, #0xff00ff00ff00ff00 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #30] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %c = and <16 x i8> %b, ret <16 x i8> %c @@ -78,9 +123,57 @@ define <32 x i8> @vls_sve_and_32xi8(<32 x i8> %ap) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_32xi8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v2.2d, #0xff00ff00ff00ff00 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #46] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #62] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #60] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #58] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #56] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #54] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #52] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #50] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = and <32 x i8> %ap, @@ -102,9 +195,11 @@ define <2 x i16> @vls_sve_and_2xi16(<2 x i16> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_2xi16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov v0.s[0], wzr -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %c = and <2 x i16> %b, ret <2 x i16> %c @@ -122,8 +217,15 @@ define <4 x i16> @vls_sve_and_4xi16(<4 x i16> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_4xi16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xffff0000ffff0000 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #12] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %c = and <4 x i16> %b, ret <4 x i16> %c @@ -141,8 +243,21 @@ define <8 x i16> @vls_sve_and_8xi16(<8 x i16> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_8xi16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.2d, #0xffff0000ffff0000 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #28] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %c = and <8 x i16> %b, ret <8 x i16> %c @@ -163,9 +278,33 @@ define <16 x i16> @vls_sve_and_16xi16(<16 x i16> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_16xi16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v2.2d, #0xffff0000ffff0000 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #44] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #60] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #52] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %c = and <16 x i16> %b, ret <16 x i16> %c @@ -183,9 +322,11 @@ define <2 x i32> @vls_sve_and_2xi32(<2 x i32> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_2xi32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov v0.s[0], wzr -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %c = and <2 x i32> %b, ret <2 x i32> %c @@ -203,8 +344,13 @@ define <4 x i32> @vls_sve_and_4xi32(<4 x i32> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_4xi32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.2d, #0xffffffff00000000 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %c = and <4 x i32> %b, ret <4 x i32> %c @@ -225,9 +371,17 @@ define <8 x i32> @vls_sve_and_8xi32(<8 x i32> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_8xi32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v2.2d, #0xffffffff00000000 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %c = and <8 x i32> %b, ret <8 x i32> %c @@ -245,7 +399,11 @@ define <2 x i64> @vls_sve_and_2xi64(<2 x i64> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_2xi64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov v0.d[0], xzr +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: stp xzr, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %c = and <2 x i64> %b, ret <2 x i64> %c @@ -265,8 +423,16 @@ define <4 x i64> @vls_sve_and_4xi64(<4 x i64> %b) nounwind { ; ; NONEON-NOSVE-LABEL: vls_sve_and_4xi64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov v0.d[0], xzr -; NONEON-NOSVE-NEXT: mov v1.d[0], xzr +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #40] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp xzr, x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: stp xzr, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %c = and <4 x i64> %b, ret <4 x i64> %c diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll index cd6c2b489efe4c..f920efeb4892d1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll @@ -22,12 +22,26 @@ define <4 x i8> @ctlz_v4i8(<4 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: mov w8, #8 // =0x8 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h -; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w9 +; NONEON-NOSVE-NEXT: clz w10, w10 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: sub w9, w9, #24 +; NONEON-NOSVE-NEXT: sub w10, w10, #24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: clz w8, w11 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w10, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.ctlz.v4i8(<4 x i8> %op) ret <4 x i8> %res @@ -44,7 +58,42 @@ define <8 x i8> @ctlz_v8i8(<8 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op) ret <8 x i8> %res @@ -61,7 +110,74 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op) ret <16 x i8> %res @@ -79,10 +195,140 @@ define void @ctlz_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctlz_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: clz v0.16b, v0.16b -; NONEON-NOSVE-NEXT: clz v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op) @@ -103,12 +349,17 @@ define <2 x i16> @ctlz_v2i16(<2 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: dup v1.2s, w8 -; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s -; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w9 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: sub w9, w9, #16 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %op) ret <2 x i16> %res @@ -125,7 +376,26 @@ define <4 x i16> @ctlz_v4i16(<4 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op) ret <4 x i16> %res @@ -142,7 +412,42 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -160,10 +465,76 @@ define void @ctlz_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctlz_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h -; NONEON-NOSVE-NEXT: clz v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: sub w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op) @@ -182,7 +553,15 @@ define <2 x i32> @ctlz_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op) ret <2 x i32> %res @@ -199,7 +578,20 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -217,10 +609,32 @@ define void @ctlz_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctlz_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: clz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op) @@ -239,23 +653,13 @@ define <1 x i64> @ctlz_v1i64(<1 x i64> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushr d1, d0, #1 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushr d1, d0, #2 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushr d1, d0, #4 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushr d1, d0, #8 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushr d1, d0, #16 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushr d1, d0, #32 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: mvn v0.8b, v0.8b -; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h -; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op) ret <1 x i64> %res @@ -272,23 +676,15 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) { ; ; NONEON-NOSVE-LABEL: ctlz_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #1 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #2 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #4 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #8 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #16 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: ushr v1.2d, v0.2d, #32 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op) ret <2 x i64> %res @@ -306,42 +702,22 @@ define void @ctlz_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctlz_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #1 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #1 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #2 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #2 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #4 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #4 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #8 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #8 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #16 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #16 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: ushr v2.2d, v0.2d, #32 -; NONEON-NOSVE-NEXT: ushr v3.2d, v1.2d, #32 -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: mvn v1.16b, v1.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s -; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op) @@ -365,10 +741,37 @@ define <4 x i8> @ctpop_v4i8(<4 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #66] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: fmov d1, x9 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d2, x10 +; NONEON-NOSVE-NEXT: fmov d3, x8 ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: cnt v1.8b, v1.8b +; NONEON-NOSVE-NEXT: cnt v2.8b, v2.8b +; NONEON-NOSVE-NEXT: cnt v3.8b, v3.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h1, v1.8b +; NONEON-NOSVE-NEXT: uaddlv h2, v2.8b +; NONEON-NOSVE-NEXT: uaddlv h3, v3.8b +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: stp q3, q2, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.ctpop.v4i8(<4 x i8> %op) ret <4 x i8> %res @@ -385,7 +788,67 @@ define <8 x i8> @ctpop_v8i8(<8 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v8i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 +; NONEON-NOSVE-NEXT: str d0, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #135] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #134] ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #133] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #132] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #131] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #130] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #129] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #128] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: strb w8, [sp, #143] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #141] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #139] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #137] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #136] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op) ret <8 x i8> %res @@ -402,7 +865,126 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #304 +; NONEON-NOSVE-NEXT: str x29, [sp, #288] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 304 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str q0, [sp, #256] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #271] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #270] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #240] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #269] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #224] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #268] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #208] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #267] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #266] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #176] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #265] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #160] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #264] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #263] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #262] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #261] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #260] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #259] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #258] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #257] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #256] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #240] +; NONEON-NOSVE-NEXT: strb w8, [sp, #287] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #224] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #286] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #208] +; NONEON-NOSVE-NEXT: strb w8, [sp, #285] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: strb w8, [sp, #283] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #282] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: strb w8, [sp, #281] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: strb w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: strb w8, [sp, #279] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: strb w8, [sp, #278] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #277] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: strb w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #275] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #274] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #273] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #272] +; NONEON-NOSVE-NEXT: add sp, sp, #304 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op) ret <16 x i8> %res @@ -420,10 +1002,240 @@ define void @ctpop_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctpop_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b +; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #576 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 592 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #512] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #543] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #542] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #240] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #541] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #224] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #540] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #208] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #539] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #538] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #176] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #537] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #160] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #536] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #535] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #534] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #533] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #532] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #531] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #530] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #529] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #528] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #527] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #526] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #496] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #525] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #480] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #524] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #464] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #523] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #448] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #522] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #432] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #521] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #416] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #520] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #400] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #519] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #384] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #518] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #368] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #517] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #352] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #516] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #336] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #515] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #320] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #514] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #304] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #513] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #288] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #512] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #272] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #240] +; NONEON-NOSVE-NEXT: strb w8, [sp, #575] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #224] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #574] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #208] +; NONEON-NOSVE-NEXT: strb w8, [sp, #573] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: strb w8, [sp, #572] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: strb w8, [sp, #571] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: str q0, [sp, #256] +; NONEON-NOSVE-NEXT: strb w8, [sp, #570] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: strb w8, [sp, #569] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: strb w8, [sp, #568] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: strb w8, [sp, #567] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: strb w8, [sp, #566] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #565] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: strb w8, [sp, #564] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #563] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #562] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #561] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #560] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #496] +; NONEON-NOSVE-NEXT: strb w8, [sp, #559] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #480] +; NONEON-NOSVE-NEXT: strb w8, [sp, #558] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #464] +; NONEON-NOSVE-NEXT: strb w8, [sp, #557] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #448] +; NONEON-NOSVE-NEXT: strb w8, [sp, #556] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #432] +; NONEON-NOSVE-NEXT: strb w8, [sp, #555] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #416] +; NONEON-NOSVE-NEXT: strb w8, [sp, #554] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #400] +; NONEON-NOSVE-NEXT: strb w8, [sp, #553] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #384] +; NONEON-NOSVE-NEXT: strb w8, [sp, #552] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #368] +; NONEON-NOSVE-NEXT: strb w8, [sp, #551] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #352] +; NONEON-NOSVE-NEXT: strb w8, [sp, #550] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #336] +; NONEON-NOSVE-NEXT: strb w8, [sp, #549] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #320] +; NONEON-NOSVE-NEXT: strb w8, [sp, #548] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #304] +; NONEON-NOSVE-NEXT: strb w8, [sp, #547] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #288] +; NONEON-NOSVE-NEXT: strb w8, [sp, #546] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #272] +; NONEON-NOSVE-NEXT: strb w8, [sp, #545] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #256] +; NONEON-NOSVE-NEXT: strb w8, [sp, #544] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #544] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #576 +; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op) @@ -443,11 +1255,23 @@ define <2 x i16> @ctpop_v2i16(<2 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: str d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: fmov d1, x9 ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: cnt v1.8b, v1.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h1, v1.8b +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %op) ret <2 x i16> %res @@ -464,8 +1288,39 @@ define <4 x i16> @ctpop_v4i16(<4 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v4i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #64] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #68] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #66] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #64] ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op) ret <4 x i16> %res @@ -482,8 +1337,67 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #142] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #140] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #138] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #136] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #134] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #132] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #130] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #128] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #144] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -501,12 +1415,128 @@ define void @ctpop_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctpop_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #336 +; NONEON-NOSVE-NEXT: str x29, [sp, #320] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 336 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #320] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #256] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #286] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #284] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #282] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #280] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #278] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #276] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #274] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #272] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #270] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #268] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #240] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #266] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #224] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #264] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #208] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #262] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #260] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #176] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #258] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #160] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #256] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #318] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: strh w8, [sp, #316] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #314] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: strh w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #310] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #308] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #306] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #240] +; NONEON-NOSVE-NEXT: strh w8, [sp, #302] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #208] +; NONEON-NOSVE-NEXT: strh w8, [sp, #298] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: strh w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: strh w8, [sp, #294] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: strh w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: strh w8, [sp, #290] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #288] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #336 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op) @@ -525,9 +1555,24 @@ define <2 x i32> @ctpop_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v2i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: str d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op) ret <2 x i32> %res @@ -544,9 +1589,37 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #80] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -564,14 +1637,65 @@ define void @ctpop_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctpop_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #192 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 192 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #128] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #156] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #140] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #132] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #160] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #192 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op) @@ -590,10 +1714,15 @@ define <1 x i64> @ctpop_v1i64(<1 x i64> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h -; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op) ret <1 x i64> %res @@ -610,10 +1739,23 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) { ; ; NONEON-NOSVE-LABEL: ctpop_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str x8, [sp, #56] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op) ret <2 x i64> %res @@ -631,16 +1773,37 @@ define void @ctpop_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: ctpop_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s -; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str x8, [sp, #120] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str x8, [sp, #112] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str x8, [sp, #104] +; NONEON-NOSVE-NEXT: uaddlv h0, v0.8b +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str x8, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op) @@ -665,17 +1828,30 @@ define <4 x i8> @cttz_v4i8(<4 x i8> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #256 // =0x100 -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v2.4h, w8 -; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: sub v1.4h, v0.4h, v2.4h -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h -; NONEON-NOSVE-NEXT: sub v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %op) ret <4 x i8> %res @@ -693,10 +1869,50 @@ define <8 x i8> @cttz_v8i8(<8 x i8> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.8b, #1 -; NONEON-NOSVE-NEXT: sub v1.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op) ret <8 x i8> %res @@ -714,10 +1930,90 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.16b, #1 -; NONEON-NOSVE-NEXT: sub v1.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op) ret <16 x i8> %res @@ -737,15 +2033,172 @@ define void @cttz_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: cttz_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #1 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v3.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: sub v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x100 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op) @@ -766,17 +2219,19 @@ define <2 x i16> @cttz_v2i16(<2 x i16> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #65536 // =0x10000 -; NONEON-NOSVE-NEXT: dup v1.2s, w8 -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v2.2s, w8 -; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: sub v1.2s, v0.2s, v2.2s -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: dup v1.2s, w8 -; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s -; NONEON-NOSVE-NEXT: sub v0.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %op) ret <2 x i16> %res @@ -794,14 +2249,30 @@ define <4 x i16> @cttz_v4i16(<4 x i16> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 -; NONEON-NOSVE-NEXT: sub v1.4h, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: clz v0.4h, v0.4h -; NONEON-NOSVE-NEXT: sub v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op) ret <4 x i16> %res @@ -819,14 +2290,50 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v1.8h, w8 -; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 -; NONEON-NOSVE-NEXT: sub v1.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: dup v1.8h, w8 -; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h -; NONEON-NOSVE-NEXT: sub v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -846,20 +2353,92 @@ define void @cttz_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: cttz_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: mov w8, #16 // =0x10 -; NONEON-NOSVE-NEXT: sub v3.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: dup v2.8h, w8 -; NONEON-NOSVE-NEXT: clz v1.8h, v1.8h -; NONEON-NOSVE-NEXT: clz v0.8h, v0.8h -; NONEON-NOSVE-NEXT: sub v1.8h, v2.8h, v1.8h -; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x10000 +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op) @@ -879,14 +2458,17 @@ define <2 x i32> @cttz_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v1.2s, w8 -; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 -; NONEON-NOSVE-NEXT: sub v1.2s, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: dup v1.2s, w8 -; NONEON-NOSVE-NEXT: clz v0.2s, v0.2s -; NONEON-NOSVE-NEXT: sub v0.2s, v1.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op) ret <2 x i32> %res @@ -904,14 +2486,24 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v1.4s, w8 -; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 -; NONEON-NOSVE-NEXT: sub v1.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: dup v1.4s, w8 -; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: sub v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -931,20 +2523,40 @@ define void @cttz_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: cttz_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: mov w8, #32 // =0x20 -; NONEON-NOSVE-NEXT: sub v3.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: dup v2.4s, w8 -; NONEON-NOSVE-NEXT: clz v1.4s, v1.4s -; NONEON-NOSVE-NEXT: clz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: sub v1.4s, v2.4s, v1.4s -; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: clz w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op) @@ -964,14 +2576,14 @@ define <1 x i64> @cttz_v1i64(<1 x i64> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: sub d1, d0, d1 -; NONEON-NOSVE-NEXT: bic v0.8b, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: cnt v0.8b, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.4h, v0.8b -; NONEON-NOSVE-NEXT: uaddlp v0.2s, v0.4h -; NONEON-NOSVE-NEXT: uaddlp v0.1d, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op) ret <1 x i64> %res @@ -989,14 +2601,17 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) { ; ; NONEON-NOSVE-LABEL: cttz_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: dup v1.2d, x8 -; NONEON-NOSVE-NEXT: sub v1.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: bic v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op) ret <2 x i64> %res @@ -1016,22 +2631,26 @@ define void @cttz_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: cttz_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #1 // =0x1 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: sub v3.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: sub v0.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: bic v1.16b, v3.16b, v1.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: cnt v1.16b, v1.16b -; NONEON-NOSVE-NEXT: cnt v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.8h, v1.16b -; NONEON-NOSVE-NEXT: uaddlp v0.8h, v0.16b -; NONEON-NOSVE-NEXT: uaddlp v1.4s, v1.8h -; NONEON-NOSVE-NEXT: uaddlp v0.4s, v0.8h -; NONEON-NOSVE-NEXT: uaddlp v1.2d, v1.4s -; NONEON-NOSVE-NEXT: uaddlp v0.2d, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: clz x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll index 7e93ee99ed7494..41065b36020038 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll @@ -15,8 +15,14 @@ define void @bitcast_v4i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: bitcast_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr w8, [x0] -; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ldrb w8, [x0] +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #2] +; NONEON-NOSVE-NEXT: ldrb w11, [x0, #3] +; NONEON-NOSVE-NEXT: strb w11, [x1, #3] +; NONEON-NOSVE-NEXT: strb w10, [x1, #2] +; NONEON-NOSVE-NEXT: strb w9, [x1, #1] +; NONEON-NOSVE-NEXT: strb w8, [x1] ; NONEON-NOSVE-NEXT: ret %load = load volatile <4 x i8>, ptr %a %cast = bitcast <4 x i8> %load to <4 x i8> @@ -102,12 +108,22 @@ define void @bitcast_v2i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: bitcast_v2i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [x0, #2] +; NONEON-NOSVE-NEXT: str w8, [sp, #4] ; NONEON-NOSVE-NEXT: ldrh w8, [x0] -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: add x8, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] -; NONEON-NOSVE-NEXT: uzp1 v0.4h, v0.4h, v0.4h -; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: str w8, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %load = load volatile <2 x i16>, ptr %a %cast = bitcast <2 x i16> %load to <2 x half> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll index 6b8077053b590f..b908dd61f24014 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll @@ -34,13 +34,39 @@ define <8 x i32> @fixed_bitselect_v8i32(ptr %pre_cond_ptr, ptr %left_ptr, ptr %r ; ; NONEON-NOSVE-LABEL: fixed_bitselect_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] ; NONEON-NOSVE-NEXT: ldp q5, q4, [x2] -; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s -; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: neg w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v5.16b ; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v4.16b +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %pre_cond = load <8 x i32>, ptr %pre_cond_ptr %left = load <8 x i32>, ptr %left_ptr diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll index d2bfc7d4e80969..a845c3cbdc2b6d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll @@ -44,7 +44,27 @@ define <8 x i8> @concat_v8i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x i8> %op1, <4 x i8> %op2, <8 x i32> ret <8 x i8> %res @@ -62,9 +82,9 @@ define <16 x i8> @concat_v16i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <8 x i8> %op1, <8 x i8> %op2, <16 x i32> @@ -152,7 +172,17 @@ define <4 x i16> @concat_v4i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uzp1 v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x i16> %op1, <2 x i16> %op2, <4 x i32> ret <4 x i16> %res @@ -171,9 +201,9 @@ define <8 x i16> @concat_v8i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x i16> %op1, <4 x i16> %op2, <8 x i32> ret <8 x i16> %res @@ -243,7 +273,14 @@ define <2 x i32> @concat_v2i32(<1 x i32> %op1, <1 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x i32> %op1, <1 x i32> %op2, <2 x i32> ret <2 x i32> %res @@ -262,9 +299,9 @@ define <4 x i32> @concat_v4i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x i32> %op1, <2 x i32> %op2, <4 x i32> ret <4 x i32> %res @@ -332,9 +369,9 @@ define <2 x i64> @concat_v2i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x i64> %op1, <1 x i64> %op2, <2 x i32> ret <2 x i64> %res @@ -407,7 +444,14 @@ define <4 x half> @concat_v4f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x half> %op1, <2 x half> %op2, <4 x i32> ret <4 x half> %res @@ -425,9 +469,9 @@ define <8 x half> @concat_v8f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <4 x half> %op1, <4 x half> %op2, <8 x i32> ret <8 x half> %res @@ -497,7 +541,14 @@ define <2 x float> @concat_v2f32(<1 x float> %op1, <1 x float> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip1 v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x float> %op1, <1 x float> %op2, <2 x i32> ret <2 x float> %res @@ -516,9 +567,9 @@ define <4 x float> @concat_v4f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <2 x float> %op1, <2 x float> %op2, <4 x i32> ret <4 x float> %res @@ -586,9 +637,9 @@ define <2 x double> @concat_v2f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: concat_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = shufflevector <1 x double> %op1, <1 x double> %op2, <2 x i32> ret <2 x double> %res @@ -732,7 +783,11 @@ define void @concat_v32i8_4op(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: concat_v32i8_4op: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i8>, ptr %a %shuffle = shufflevector <8 x i8> %op1, <8 x i8> undef, <16 x i32> , ptr %a %shuffle = shufflevector <4 x i16> %op1, <4 x i16> undef, <8 x i32> @@ -775,7 +834,11 @@ define void @concat_v8i32_4op(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: concat_v8i32_4op: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i32>, ptr %a %shuffle = shufflevector <2 x i32> %op1, <2 x i32> undef, <4 x i32> @@ -794,7 +857,11 @@ define void @concat_v4i64_4op(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: concat_v4i64_4op: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <1 x i64>, ptr %a %shuffle = shufflevector <1 x i64> %op1, <1 x i64> undef, <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 728b85d39bb37f..2cdd4374a56c5c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -15,8 +15,28 @@ define <8 x i16> @load_zext_v8i8i16(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_zext_v8i8i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i8>, ptr %ap %val = zext <8 x i8> %a to <8 x i16> @@ -33,8 +53,18 @@ define <4 x i32> @load_zext_v4i16i32(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_zext_v4i16i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i16>, ptr %ap %val = zext <4 x i16> %a to <4 x i32> @@ -51,8 +81,15 @@ define <2 x i64> @load_zext_v2i32i64(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_zext_v2i32i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <2 x i32>, ptr %ap %val = zext <2 x i32> %a to <2 x i64> @@ -77,13 +114,14 @@ define <2 x i256> @load_zext_v2i64i256(ptr %ap) { ; NONEON-NOSVE-LABEL: load_zext_v2i64i256: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp x0, x4, [sp], #16 ; NONEON-NOSVE-NEXT: mov x1, xzr ; NONEON-NOSVE-NEXT: mov x2, xzr ; NONEON-NOSVE-NEXT: mov x3, xzr ; NONEON-NOSVE-NEXT: mov x5, xzr ; NONEON-NOSVE-NEXT: mov x6, xzr -; NONEON-NOSVE-NEXT: mov x4, v0.d[1] -; NONEON-NOSVE-NEXT: fmov x0, d0 ; NONEON-NOSVE-NEXT: mov x7, xzr ; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap @@ -110,20 +148,75 @@ define <16 x i32> @load_sext_v16i8i32(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_sext_v16i8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: sshll v1.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v2.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v0.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v1.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v4.4h, #0 -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #94] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #128] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i8>, ptr %ap %val = sext <16 x i8> %a to <16 x i32> @@ -144,12 +237,24 @@ define <8 x i32> @load_sext_v8i16i32(ptr %ap) { ; NONEON-NOSVE-LABEL: load_sext_v8i16i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i16>, ptr %ap %val = sext <8 x i16> %a to <8 x i32> @@ -186,34 +291,31 @@ define <4 x i256> @load_sext_v4i32i256(ptr %ap) { ; NONEON-NOSVE-LABEL: load_sext_v4i32i256: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: add x10, x8, #32 -; NONEON-NOSVE-NEXT: add x11, x8, #96 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: mov x9, v0.d[1] -; NONEON-NOSVE-NEXT: st1 { v0.d }[1], [x10] -; NONEON-NOSVE-NEXT: fmov x10, d0 -; NONEON-NOSVE-NEXT: st1 { v1.d }[1], [x11] -; NONEON-NOSVE-NEXT: mov x11, v1.d[1] -; NONEON-NOSVE-NEXT: asr x10, x10, #63 -; NONEON-NOSVE-NEXT: str d0, [x8] -; NONEON-NOSVE-NEXT: asr x9, x9, #63 -; NONEON-NOSVE-NEXT: str d1, [x8, #64] -; NONEON-NOSVE-NEXT: stp x10, x10, [x8, #16] -; NONEON-NOSVE-NEXT: stp x9, x9, [x8, #48] -; NONEON-NOSVE-NEXT: str x9, [x8, #40] -; NONEON-NOSVE-NEXT: fmov x9, d1 -; NONEON-NOSVE-NEXT: str x10, [x8, #8] -; NONEON-NOSVE-NEXT: asr x10, x11, #63 -; NONEON-NOSVE-NEXT: asr x9, x9, #63 +; NONEON-NOSVE-NEXT: str q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #24] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #16] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldp x11, x9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp x12, x13, [sp, #80] +; NONEON-NOSVE-NEXT: asr x10, x9, #63 +; NONEON-NOSVE-NEXT: asr x14, x11, #63 ; NONEON-NOSVE-NEXT: stp x10, x10, [x8, #112] -; NONEON-NOSVE-NEXT: str x10, [x8, #104] -; NONEON-NOSVE-NEXT: stp x9, x9, [x8, #80] -; NONEON-NOSVE-NEXT: str x9, [x8, #72] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp x9, x10, [x8, #96] +; NONEON-NOSVE-NEXT: asr x9, x13, #63 +; NONEON-NOSVE-NEXT: asr x10, x12, #63 +; NONEON-NOSVE-NEXT: stp x14, x14, [x8, #80] +; NONEON-NOSVE-NEXT: stp x11, x14, [x8, #64] +; NONEON-NOSVE-NEXT: stp x9, x9, [x8, #48] +; NONEON-NOSVE-NEXT: stp x13, x9, [x8, #32] +; NONEON-NOSVE-NEXT: stp x10, x10, [x8, #16] +; NONEON-NOSVE-NEXT: stp x12, x10, [x8] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = sext <4 x i32> %a to <4 x i256> @@ -251,18 +353,26 @@ define <2 x i256> @load_sext_v2i64i256(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_sext_v2i64i256: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: dup v1.2d, v0.d[1] -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: asr x1, x0, #63 -; NONEON-NOSVE-NEXT: asr x5, x8, #63 -; NONEON-NOSVE-NEXT: mov x2, x1 -; NONEON-NOSVE-NEXT: mov x3, x1 -; NONEON-NOSVE-NEXT: mov v1.d[1], x5 -; NONEON-NOSVE-NEXT: mov x6, x5 -; NONEON-NOSVE-NEXT: mov x7, x5 -; NONEON-NOSVE-NEXT: fmov x4, d1 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: asr x8, x10, #63 +; NONEON-NOSVE-NEXT: stp x9, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp x10, x8, [sp, #64] +; NONEON-NOSVE-NEXT: stp x8, x8, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp x0, x1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp x2, x3, [sp, #80] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #112] +; NONEON-NOSVE-NEXT: ldp x4, x5, [sp, #128] +; NONEON-NOSVE-NEXT: ldp x6, x7, [sp, #112] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap %val = sext <2 x i64> %a to <2 x i256> @@ -300,30 +410,88 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap) { ; ; NONEON-NOSVE-LABEL: load_zext_v16i16i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #336 +; NONEON-NOSVE-NEXT: str x29, [sp, #320] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 336 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushll v2.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v3.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v4.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ushll v5.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v0.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #32] -; NONEON-NOSVE-NEXT: ushll v2.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] -; NONEON-NOSVE-NEXT: stp q5, q3, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d16, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d17, [sp, #72] -; NONEON-NOSVE-NEXT: ushll v1.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: ushll v6.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: ushll v5.2d, v16.2s, #0 -; NONEON-NOSVE-NEXT: ushll v7.2d, v17.2s, #0 -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: str wzr, [sp, #316] +; NONEON-NOSVE-NEXT: str wzr, [sp, #308] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #320] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: str wzr, [sp, #300] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: str wzr, [sp, #292] +; NONEON-NOSVE-NEXT: str wzr, [sp, #284] +; NONEON-NOSVE-NEXT: str wzr, [sp, #276] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: str wzr, [sp, #268] +; NONEON-NOSVE-NEXT: str wzr, [sp, #260] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #152] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #136] +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: str d1, [sp, #328] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #104] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #248] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #240] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176] +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #232] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #224] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #216] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #208] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #192] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #332] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #200] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #192] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: str w9, [sp, #296] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldp q5, q4, [sp, #288] +; NONEON-NOSVE-NEXT: str w9, [sp, #280] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: str w9, [sp, #264] +; NONEON-NOSVE-NEXT: str w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #256] +; NONEON-NOSVE-NEXT: add sp, sp, #336 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %ap %val = zext <16 x i16> %a to <16 x i64> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll index ec6341d6085a0a..b7b34cfa1517ce 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll @@ -31,7 +31,18 @@ define <4 x i1> @extract_subvector_v8i1(<8 x i1> %op) { ; ; NONEON-NOSVE-LABEL: extract_subvector_v8i1: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %ret = call <4 x i1> @llvm.vector.extract.v4i1.v8i1(<8 x i1> %op, i64 4) ret <4 x i1> %ret @@ -63,7 +74,18 @@ define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) { ; ; NONEON-NOSVE-LABEL: extract_subvector_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip2 v0.8b, v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4) ret <4 x i8> %ret @@ -178,8 +200,12 @@ define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: extract_subvector_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: str w8, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1) ret <1 x i32> %ret @@ -275,8 +301,12 @@ define <2 x half> @extract_subvector_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: extract_subvector_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2) ret <2 x half> %ret @@ -331,8 +361,12 @@ define <1 x float> @extract_subvector_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: extract_subvector_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: str w8, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1) ret <1 x float> %ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll index ac60a614d7ce6c..0a1831a94d8fec 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll @@ -19,8 +19,11 @@ define half @extractelement_v2f16(<2 x half> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h0, v0.h[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x half> %op1, i64 1 ret half %r @@ -36,8 +39,11 @@ define half @extractelement_v4f16(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <4 x half> %op1, i64 3 ret half %r @@ -53,7 +59,10 @@ define half @extractelement_v8f16(<8 x half> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <8 x half> %op1, i64 7 ret half %r @@ -69,7 +78,11 @@ define half @extractelement_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: extractelement_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr h0, [x0, #30] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %r = extractelement <16 x half> %op1, i64 15 @@ -86,8 +99,11 @@ define float @extractelement_v2f32(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov s0, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x float> %op1, i64 1 ret float %r @@ -103,7 +119,10 @@ define float @extractelement_v4f32(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov s0, v0.s[3] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <4 x float> %op1, i64 3 ret float %r @@ -119,7 +138,11 @@ define float @extractelement_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: extractelement_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0, #28] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %r = extractelement <8 x float> %op1, i64 7 @@ -147,7 +170,10 @@ define double @extractelement_v2f64(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: extractelement_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov d0, v0.d[1] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = extractelement <2 x double> %op1, i64 1 ret double %r @@ -163,7 +189,11 @@ define double @extractelement_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: extractelement_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0, #24] +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %r = extractelement <4 x double> %op1, i64 3 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index c1d84f6a15ed8c..a8d01ec7ce0b4b 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -32,12 +32,58 @@ define void @test_copysign_v4f16_v4f16(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [x1] ; NONEON-NOSVE-NEXT: ldr d1, [x0] -; NONEON-NOSVE-NEXT: ldr d2, [x1] -; NONEON-NOSVE-NEXT: dup v0.4h, w8 -; NONEON-NOSVE-NEXT: bsl v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x half>, ptr %bp @@ -68,12 +114,102 @@ define void @test_copysign_v8f16_v8f16(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldr q0, [x1] ; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x1] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: str h0, [sp, #4] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: str h0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %a = load <8 x half>, ptr %ap %b = load <8 x half>, ptr %bp @@ -108,13 +244,191 @@ define void @test_copysign_v16f16_v16f16(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v16f16_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff -; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] ; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #96] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #126] +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #124] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #122] +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #120] +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #118] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #116] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #114] +; NONEON-NOSVE-NEXT: str h0, [sp, #4] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #112] +; NONEON-NOSVE-NEXT: str h0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #94] +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #92] +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #90] +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #88] +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #86] +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #84] +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #82] +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #80] +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #110] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #158] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #108] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #156] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #106] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #154] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #104] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #152] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #102] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #150] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #100] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #148] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #98] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #146] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #96] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #144] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #78] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #142] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #76] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #140] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #74] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #138] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #72] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #136] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #70] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #134] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #68] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #132] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #66] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #130] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <16 x half>, ptr %ap %b = load <16 x half>, ptr %bp @@ -147,12 +461,26 @@ define void @test_copysign_v2f32_v2f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d0, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldr d1, [x0] -; NONEON-NOSVE-NEXT: ldr d2, [x1] -; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s -; NONEON-NOSVE-NEXT: bsl v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: ldr d1, [x1] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <2 x float>, ptr %ap %b = load <2 x float>, ptr %bp @@ -183,12 +511,37 @@ define void @test_copysign_v4f32_v4f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x1] -; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x float>, ptr %ap %b = load <4 x float>, ptr %bp @@ -223,13 +576,63 @@ define void @test_copysign_v8f32_v8f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v8f32_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <8 x float>, ptr %ap %b = load <8 x float>, ptr %bp @@ -262,12 +665,25 @@ define void @test_copysign_v2f64_v2f64(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x1] -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: bsl v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <2 x double>, ptr %ap %b = load <2 x double>, ptr %bp @@ -302,13 +718,39 @@ define void @test_copysign_v4f64_v4f64(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldp q1, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x double>, ptr %bp @@ -347,13 +789,27 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v2f32_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d0, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: ldr d2, [x0] -; NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d -; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s -; NONEON-NOSVE-NEXT: bsl v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: str d1, [sp, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <2 x float>, ptr %ap %b = load <2 x double>, ptr %bp @@ -402,14 +858,39 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f32_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d -; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.4s, v2.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] ; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s2, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %a = load <4 x float>, ptr %ap %b = load <4 x double>, ptr %bp @@ -447,13 +928,27 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v2f64_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: ldr d1, [x1] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: ldr d0, [x1] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <2 x double>, ptr %ap %b = load < 2 x float>, ptr %bp @@ -502,19 +997,41 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f64_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: movi v0.2d, #0xffffffffffffffff -; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtl v4.2d, v4.2s -; NONEON-NOSVE-NEXT: bit v1.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v3.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #16] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d2, d1, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <4 x double>, ptr %ap %b = load <4 x float>, ptr %bp @@ -554,13 +1071,49 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d1, [x0] ; NONEON-NOSVE-NEXT: ldr q0, [x1] -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff -; NONEON-NOSVE-NEXT: ldr d2, [x0] -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: bit v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: str d1, [sp, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x float>, ptr %bp @@ -620,21 +1173,49 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v4f16_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x1] -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff -; NONEON-NOSVE-NEXT: mov d1, v0.d[1] -; NONEON-NOSVE-NEXT: fcvt h0, d0 -; NONEON-NOSVE-NEXT: fcvt h1, d1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, d2 -; NONEON-NOSVE-NEXT: mov d2, v2.d[1] -; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, d2 +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] ; NONEON-NOSVE-NEXT: ldr d2, [x0] -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: dup v1.4h, w8 -; NONEON-NOSVE-NEXT: bit v0.8b, v2.8b, v1.8b +; NONEON-NOSVE-NEXT: str d2, [sp, #8] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst x9, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst x8, #0x8000000000000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %a = load <4 x half>, ptr %ap %b = load <4 x double>, ptr %bp @@ -682,14 +1263,83 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_v8f16_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: mov w8, #32767 // =0x7fff +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] ; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s -; NONEON-NOSVE-NEXT: dup v1.8h, w8 -; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v1.16b +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w9, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: tst w8, #0x80000000 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %a = load <8 x half>, ptr %ap %b = load <8 x float>, ptr %bp diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll index b51b89d08844d0..e84acfc8504a95 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll @@ -21,10 +21,39 @@ define <2 x half> @fadd_v2f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fadd <2 x half> %op1, %op2 ret <2 x half> %res @@ -42,10 +71,39 @@ define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fadd <4 x half> %op1, %op2 ret <4 x half> %res @@ -63,14 +121,66 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fadd <8 x half> %op1, %op2 ret <8 x half> %res @@ -90,25 +200,127 @@ define void @fadd_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -129,7 +341,17 @@ define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fadd v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fadd <2 x float> %op1, %op2 ret <2 x float> %res @@ -147,7 +369,22 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fadd <4 x float> %op1, %op2 ret <4 x float> %res @@ -167,11 +404,39 @@ define void @fadd_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -192,7 +457,16 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fadd_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fadd v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fadd <2 x double> %op1, %op2 ret <2 x double> %res @@ -212,11 +486,27 @@ define void @fadd_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -241,10 +531,39 @@ define <2 x half> @fdiv_v2f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x half> %op1, %op2 ret <2 x half> %res @@ -262,10 +581,39 @@ define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fdiv <4 x half> %op1, %op2 ret <4 x half> %res @@ -283,14 +631,66 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fdiv v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fdiv v1.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fdiv <8 x half> %op1, %op2 ret <8 x half> %res @@ -310,26 +710,127 @@ define void @fdiv_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fdiv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q4, q1, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v5.4s, v4.8h -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v4.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fdiv v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: ldr q3, [x0] -; NONEON-NOSVE-NEXT: fcvtl2 v6.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: fdiv v3.4s, v3.4s, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fdiv v5.4s, v6.4s, v5.4s -; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -350,7 +851,17 @@ define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fdiv v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x float> %op1, %op2 ret <2 x float> %res @@ -368,7 +879,22 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fdiv v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fdiv <4 x float> %op1, %op2 ret <4 x float> %res @@ -388,11 +914,39 @@ define void @fdiv_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fdiv_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fdiv v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fdiv v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fdiv s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fdiv s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -413,7 +967,16 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fdiv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fdiv v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fdiv d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fdiv d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fdiv <2 x double> %op1, %op2 ret <2 x double> %res @@ -433,11 +996,27 @@ define void @fdiv_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fdiv_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fdiv v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fdiv v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fdiv d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fdiv d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fdiv d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fdiv d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -463,42 +1042,48 @@ define <2 x half> @fma_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3) ; ; NONEON-NOSVE-LABEL: fma_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d2 killed $d2 def $q2 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: fcvt s16, h0 -; NONEON-NOSVE-NEXT: mov h17, v2.h[2] -; NONEON-NOSVE-NEXT: mov h18, v1.h[2] -; NONEON-NOSVE-NEXT: mov h19, v0.h[2] -; NONEON-NOSVE-NEXT: mov h2, v2.h[3] -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 -; NONEON-NOSVE-NEXT: mov h16, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s7, h19 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmadd s3, s5, s4, s3 -; NONEON-NOSVE-NEXT: fcvt s4, h17 -; NONEON-NOSVE-NEXT: fcvt s5, h18 -; NONEON-NOSVE-NEXT: fcvt h0, s6 -; NONEON-NOSVE-NEXT: fmadd s4, s7, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h16 -; NONEON-NOSVE-NEXT: mov v0.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fmadd s1, s5, s1, s2 -; NONEON-NOSVE-NEXT: mov v0.h[2], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h2, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.fma.v2f16(<2 x half> %op1, <2 x half> %op2, <2 x half> %op3) ret <2 x half> %res @@ -517,42 +1102,48 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ; ; NONEON-NOSVE-LABEL: fma_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d2 killed $d2 def $q2 -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: fcvt s16, h0 -; NONEON-NOSVE-NEXT: mov h17, v2.h[2] -; NONEON-NOSVE-NEXT: mov h18, v1.h[2] -; NONEON-NOSVE-NEXT: mov h19, v0.h[2] -; NONEON-NOSVE-NEXT: mov h2, v2.h[3] -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 -; NONEON-NOSVE-NEXT: mov h16, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s7, h19 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmadd s3, s5, s4, s3 -; NONEON-NOSVE-NEXT: fcvt s4, h17 -; NONEON-NOSVE-NEXT: fcvt s5, h18 -; NONEON-NOSVE-NEXT: fcvt h0, s6 -; NONEON-NOSVE-NEXT: fmadd s4, s7, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h16 -; NONEON-NOSVE-NEXT: mov v0.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fmadd s1, s5, s1, s2 -; NONEON-NOSVE-NEXT: mov v0.h[2], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h2, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ret <4 x half> %res @@ -571,75 +1162,84 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ; ; NONEON-NOSVE-LABEL: fma_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h3, v2.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: fcvt s16, h0 -; NONEON-NOSVE-NEXT: mov h17, v2.h[2] -; NONEON-NOSVE-NEXT: mov h18, v1.h[2] -; NONEON-NOSVE-NEXT: mov h19, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fmadd s6, s16, s7, s6 -; NONEON-NOSVE-NEXT: fcvt s7, h17 -; NONEON-NOSVE-NEXT: fcvt s16, h18 -; NONEON-NOSVE-NEXT: fcvt s17, h19 -; NONEON-NOSVE-NEXT: mov h18, v1.h[3] -; NONEON-NOSVE-NEXT: mov h19, v0.h[3] -; NONEON-NOSVE-NEXT: fmadd s4, s5, s4, s3 -; NONEON-NOSVE-NEXT: mov h5, v2.h[3] -; NONEON-NOSVE-NEXT: fcvt h3, s6 -; NONEON-NOSVE-NEXT: fmadd s6, s17, s16, s7 -; NONEON-NOSVE-NEXT: mov h17, v2.h[4] -; NONEON-NOSVE-NEXT: fcvt s7, h18 -; NONEON-NOSVE-NEXT: fcvt s16, h19 -; NONEON-NOSVE-NEXT: mov h18, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov h19, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: mov v3.h[1], v4.h[0] -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: fmadd s5, s16, s7, s5 -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: mov h16, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: mov v3.h[2], v6.h[0] -; NONEON-NOSVE-NEXT: mov h6, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt h5, s5 -; NONEON-NOSVE-NEXT: fmadd s17, s19, s18, s17 -; NONEON-NOSVE-NEXT: mov h18, v1.h[6] -; NONEON-NOSVE-NEXT: mov h19, v0.h[6] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fmadd s4, s16, s7, s4 -; NONEON-NOSVE-NEXT: mov v3.h[3], v5.h[0] -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fcvt s6, h18 -; NONEON-NOSVE-NEXT: fcvt s7, h19 -; NONEON-NOSVE-NEXT: fcvt h16, s17 +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h2, [sp] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fmadd s5, s7, s6, s5 -; NONEON-NOSVE-NEXT: mov v3.h[4], v16.h[0] -; NONEON-NOSVE-NEXT: fmadd s0, s0, s1, s2 -; NONEON-NOSVE-NEXT: mov v3.h[5], v4.h[0] -; NONEON-NOSVE-NEXT: fcvt h4, s5 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v3.h[6], v4.h[0] -; NONEON-NOSVE-NEXT: mov v3.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: mov v0.16b, v3.16b +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ret <8 x half> %res @@ -660,146 +1260,161 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q3, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q4, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q5, q2, [x2] -; NONEON-NOSVE-NEXT: mov h25, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s19, h0 -; NONEON-NOSVE-NEXT: mov h24, v0.h[2] -; NONEON-NOSVE-NEXT: mov h17, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s18, h1 -; NONEON-NOSVE-NEXT: mov h22, v1.h[2] -; NONEON-NOSVE-NEXT: mov h16, v2.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: mov h20, v2.h[2] -; NONEON-NOSVE-NEXT: mov h26, v5.h[1] -; NONEON-NOSVE-NEXT: mov h27, v4.h[1] -; NONEON-NOSVE-NEXT: mov h28, v3.h[1] -; NONEON-NOSVE-NEXT: fcvt s25, h25 -; NONEON-NOSVE-NEXT: mov h7, v2.h[3] -; NONEON-NOSVE-NEXT: mov h29, v4.h[2] -; NONEON-NOSVE-NEXT: fcvt s23, h17 -; NONEON-NOSVE-NEXT: mov h17, v0.h[3] -; NONEON-NOSVE-NEXT: mov h30, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s21, h16 -; NONEON-NOSVE-NEXT: fmadd s6, s19, s18, s6 -; NONEON-NOSVE-NEXT: fcvt s18, h20 -; NONEON-NOSVE-NEXT: fcvt s19, h22 -; NONEON-NOSVE-NEXT: fcvt s20, h24 -; NONEON-NOSVE-NEXT: mov h16, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s22, h5 -; NONEON-NOSVE-NEXT: fcvt s24, h4 -; NONEON-NOSVE-NEXT: fcvt s26, h26 -; NONEON-NOSVE-NEXT: fcvt s27, h27 -; NONEON-NOSVE-NEXT: fcvt s28, h28 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fmadd s21, s25, s23, s21 -; NONEON-NOSVE-NEXT: fcvt s23, h3 -; NONEON-NOSVE-NEXT: mov h25, v5.h[2] -; NONEON-NOSVE-NEXT: fmadd s18, s20, s19, s18 -; NONEON-NOSVE-NEXT: mov h19, v3.h[2] -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: mov h31, v0.h[4] -; NONEON-NOSVE-NEXT: fmadd s26, s28, s27, s26 -; NONEON-NOSVE-NEXT: mov h27, v4.h[3] -; NONEON-NOSVE-NEXT: mov h28, v3.h[3] -; NONEON-NOSVE-NEXT: fmadd s22, s23, s24, s22 -; NONEON-NOSVE-NEXT: fcvt h20, s21 -; NONEON-NOSVE-NEXT: mov h21, v2.h[4] -; NONEON-NOSVE-NEXT: fcvt s23, h25 -; NONEON-NOSVE-NEXT: fcvt s24, h29 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: fmadd s16, s17, s16, s7 -; NONEON-NOSVE-NEXT: mov h25, v5.h[3] -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: fcvt h26, s26 -; NONEON-NOSVE-NEXT: mov h29, v2.h[5] -; NONEON-NOSVE-NEXT: mov v6.h[1], v20.h[0] -; NONEON-NOSVE-NEXT: fcvt s17, h21 -; NONEON-NOSVE-NEXT: fcvt s20, h30 -; NONEON-NOSVE-NEXT: fmadd s19, s19, s24, s23 -; NONEON-NOSVE-NEXT: fcvt s21, h31 -; NONEON-NOSVE-NEXT: fcvt h7, s22 -; NONEON-NOSVE-NEXT: fcvt s22, h25 -; NONEON-NOSVE-NEXT: fcvt s23, h27 -; NONEON-NOSVE-NEXT: fcvt s24, h28 -; NONEON-NOSVE-NEXT: mov h25, v5.h[4] -; NONEON-NOSVE-NEXT: mov h27, v4.h[4] -; NONEON-NOSVE-NEXT: mov h28, v3.h[4] -; NONEON-NOSVE-NEXT: mov h30, v1.h[5] -; NONEON-NOSVE-NEXT: mov h31, v0.h[5] -; NONEON-NOSVE-NEXT: mov v6.h[2], v18.h[0] -; NONEON-NOSVE-NEXT: fmadd s17, s21, s20, s17 -; NONEON-NOSVE-NEXT: mov v7.h[1], v26.h[0] -; NONEON-NOSVE-NEXT: fcvt h18, s19 -; NONEON-NOSVE-NEXT: fmadd s19, s24, s23, s22 -; NONEON-NOSVE-NEXT: mov h26, v5.h[5] -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt s20, h25 -; NONEON-NOSVE-NEXT: fcvt s21, h27 -; NONEON-NOSVE-NEXT: fcvt s22, h28 -; NONEON-NOSVE-NEXT: mov h27, v4.h[5] -; NONEON-NOSVE-NEXT: mov h28, v3.h[5] -; NONEON-NOSVE-NEXT: fcvt s23, h29 -; NONEON-NOSVE-NEXT: fcvt s24, h30 -; NONEON-NOSVE-NEXT: fcvt s25, h31 -; NONEON-NOSVE-NEXT: mov h29, v2.h[6] -; NONEON-NOSVE-NEXT: mov h30, v1.h[6] -; NONEON-NOSVE-NEXT: mov h31, v0.h[6] -; NONEON-NOSVE-NEXT: mov v7.h[2], v18.h[0] -; NONEON-NOSVE-NEXT: fcvt h18, s19 -; NONEON-NOSVE-NEXT: fmadd s19, s22, s21, s20 -; NONEON-NOSVE-NEXT: mov h20, v5.h[6] -; NONEON-NOSVE-NEXT: mov h21, v4.h[6] -; NONEON-NOSVE-NEXT: mov h22, v3.h[6] -; NONEON-NOSVE-NEXT: fcvt s26, h26 -; NONEON-NOSVE-NEXT: fmadd s23, s25, s24, s23 -; NONEON-NOSVE-NEXT: fcvt s27, h27 -; NONEON-NOSVE-NEXT: fcvt s28, h28 -; NONEON-NOSVE-NEXT: mov v6.h[3], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s17 -; NONEON-NOSVE-NEXT: fcvt s17, h29 -; NONEON-NOSVE-NEXT: fcvt s24, h30 -; NONEON-NOSVE-NEXT: fcvt s25, h31 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: fcvt s21, h21 -; NONEON-NOSVE-NEXT: fcvt s22, h22 -; NONEON-NOSVE-NEXT: mov v7.h[3], v18.h[0] -; NONEON-NOSVE-NEXT: fmadd s26, s28, s27, s26 -; NONEON-NOSVE-NEXT: fcvt h18, s19 -; NONEON-NOSVE-NEXT: mov h5, v5.h[7] -; NONEON-NOSVE-NEXT: mov h4, v4.h[7] -; NONEON-NOSVE-NEXT: mov h3, v3.h[7] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: fmadd s17, s25, s24, s17 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fmadd s19, s22, s21, s20 -; NONEON-NOSVE-NEXT: mov v6.h[4], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s23 -; NONEON-NOSVE-NEXT: mov v7.h[4], v18.h[0] -; NONEON-NOSVE-NEXT: fcvt h18, s26 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q2, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #94] +; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #126] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #92] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #124] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #90] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #56] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #122] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #88] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v6.h[5], v16.h[0] -; NONEON-NOSVE-NEXT: mov v7.h[5], v18.h[0] -; NONEON-NOSVE-NEXT: fmadd s3, s3, s4, s5 -; NONEON-NOSVE-NEXT: fcvt h4, s19 -; NONEON-NOSVE-NEXT: fcvt h5, s17 -; NONEON-NOSVE-NEXT: fmadd s0, s0, s1, s2 -; NONEON-NOSVE-NEXT: mov v7.h[6], v4.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s3 -; NONEON-NOSVE-NEXT: mov v6.h[6], v5.h[0] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v7.h[7], v1.h[0] -; NONEON-NOSVE-NEXT: mov v6.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q7, q6, [x0] +; NONEON-NOSVE-NEXT: str h0, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #86] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #118] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #84] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #116] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #82] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #114] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #80] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #112] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #110] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #108] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #106] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #104] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #102] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #100] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h2, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #98] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -822,8 +1437,19 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o ; ; NONEON-NOSVE-LABEL: fma_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.2s, v1.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) ret <2 x float> %res @@ -842,8 +1468,26 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o ; ; NONEON-NOSVE-LABEL: fma_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) ret <4 x float> %res @@ -864,12 +1508,45 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fmla v1.4s, v0.4s, v2.4s -; NONEON-NOSVE-NEXT: fmla v5.4s, v4.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q2, [sp] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #92] +; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #56] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #88] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #48] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #120] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #84] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #80] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #104] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -892,8 +1569,19 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double ; ; NONEON-NOSVE-LABEL: fma_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) ret <2 x double> %res @@ -914,12 +1602,31 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fmla v1.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: fmla v5.2d, v4.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] +; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp, #48] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #80] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp] +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #112] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -945,10 +1652,39 @@ define <2 x half> @fmul_v2f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fmul <2 x half> %op1, %op2 ret <2 x half> %res @@ -966,10 +1702,39 @@ define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fmul <4 x half> %op1, %op2 ret <4 x half> %res @@ -987,14 +1752,66 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fmul v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fmul v1.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fmul <8 x half> %op1, %op2 ret <8 x half> %res @@ -1014,25 +1831,127 @@ define void @fmul_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmul_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fmul v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fmul v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fmul v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmul v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1053,7 +1972,17 @@ define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fmul <2 x float> %op1, %op2 ret <2 x float> %res @@ -1071,7 +2000,22 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fmul <4 x float> %op1, %op2 ret <4 x float> %res @@ -1091,11 +2035,39 @@ define void @fmul_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmul_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmul v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmul v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmul s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -1116,7 +2088,16 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmul_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmul v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmul d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmul d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fmul <2 x double> %op1, %op2 ret <2 x double> %res @@ -1136,11 +2117,27 @@ define void @fmul_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmul_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmul v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmul v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fmul d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fmul d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmul d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmul d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -1164,8 +2161,30 @@ define <2 x half> @fneg_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.4h, #128, lsl #8 -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fneg <2 x half> %op ret <2 x half> %res @@ -1182,8 +2201,30 @@ define <4 x half> @fneg_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.4h, #128, lsl #8 -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fneg <4 x half> %op ret <4 x half> %res @@ -1200,8 +2241,50 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v1.8h, #128, lsl #8 -; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fneg <8 x half> %op ret <8 x half> %res @@ -1219,11 +2302,92 @@ define void @fneg_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fneg_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.8h, #128, lsl #8 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: eor w8, w8, #0x8000 +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = fneg <16 x half> %op @@ -1242,7 +2406,15 @@ define <2 x float> @fneg_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fneg v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fneg <2 x float> %op ret <2 x float> %res @@ -1259,7 +2431,20 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fneg <4 x float> %op ret <4 x float> %res @@ -1277,10 +2462,32 @@ define void @fneg_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: fneg_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fneg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fneg v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fneg s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fneg s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = fneg <8 x float> %op @@ -1299,7 +2506,15 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: fneg_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fneg d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fneg <2 x double> %op ret <2 x double> %res @@ -1317,10 +2532,22 @@ define void @fneg_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: fneg_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fneg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fneg v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fneg d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fneg d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fneg d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = fneg <4 x double> %op @@ -1343,26 +2570,30 @@ define <2 x half> @fsqrt_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 -; NONEON-NOSVE-NEXT: mov h3, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fsqrt s2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fsqrt s1, s1 -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fsqrt s3, s3 -; NONEON-NOSVE-NEXT: fsqrt s4, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s2 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s3 -; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s4 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %op) ret <2 x half> %res @@ -1379,26 +2610,30 @@ define <4 x half> @fsqrt_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 -; NONEON-NOSVE-NEXT: mov h3, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fsqrt s2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fsqrt s1, s1 -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fsqrt s3, s3 -; NONEON-NOSVE-NEXT: fsqrt s4, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s2 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s3 -; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s4 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op) ret <4 x half> %res @@ -1415,44 +2650,50 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 -; NONEON-NOSVE-NEXT: mov h3, v0.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[3] -; NONEON-NOSVE-NEXT: mov h5, v0.h[4] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: mov h7, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fsqrt s2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h0 -; NONEON-NOSVE-NEXT: fcvt h0, s2 -; NONEON-NOSVE-NEXT: fsqrt s1, s1 -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s3, s3 -; NONEON-NOSVE-NEXT: fcvt h1, s3 -; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s4, s4 -; NONEON-NOSVE-NEXT: fcvt h1, s4 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s5, s5 -; NONEON-NOSVE-NEXT: fcvt h1, s5 -; NONEON-NOSVE-NEXT: mov v0.h[4], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s6, s6 -; NONEON-NOSVE-NEXT: fcvt h1, s6 -; NONEON-NOSVE-NEXT: mov v0.h[5], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s7, s7 -; NONEON-NOSVE-NEXT: fcvt h1, s7 -; NONEON-NOSVE-NEXT: mov v0.h[6], v1.h[0] -; NONEON-NOSVE-NEXT: fsqrt s2, s16 -; NONEON-NOSVE-NEXT: fcvt h1, s2 -; NONEON-NOSVE-NEXT: mov v0.h[7], v1.h[0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op) ret <8 x half> %res @@ -1470,85 +2711,92 @@ define void @fsqrt_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fsqrt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q16, [x0] -; NONEON-NOSVE-NEXT: mov h0, v1.h[1] -; NONEON-NOSVE-NEXT: mov h17, v16.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s18, h16 -; NONEON-NOSVE-NEXT: mov h19, v16.h[2] -; NONEON-NOSVE-NEXT: mov h4, v1.h[3] -; NONEON-NOSVE-NEXT: mov h20, v16.h[3] -; NONEON-NOSVE-NEXT: mov h5, v1.h[4] -; NONEON-NOSVE-NEXT: mov h21, v16.h[4] -; NONEON-NOSVE-NEXT: mov h6, v1.h[5] -; NONEON-NOSVE-NEXT: mov h22, v16.h[5] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fsqrt s2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: mov h7, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s21, h21 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s22, h22 -; NONEON-NOSVE-NEXT: mov h23, v16.h[6] -; NONEON-NOSVE-NEXT: mov h16, v16.h[7] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s23, h23 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fsqrt s0, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v2.h[1], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s17, s17 -; NONEON-NOSVE-NEXT: fcvt h17, s17 -; NONEON-NOSVE-NEXT: fsqrt s18, s18 -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: mov v18.h[1], v17.h[0] -; NONEON-NOSVE-NEXT: fsqrt s3, s3 -; NONEON-NOSVE-NEXT: fcvt h0, s3 -; NONEON-NOSVE-NEXT: mov v2.h[2], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s19, s19 -; NONEON-NOSVE-NEXT: fcvt h17, s19 -; NONEON-NOSVE-NEXT: mov v18.h[2], v17.h[0] -; NONEON-NOSVE-NEXT: fsqrt s4, s4 -; NONEON-NOSVE-NEXT: fcvt h0, s4 -; NONEON-NOSVE-NEXT: mov v2.h[3], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s20, s20 -; NONEON-NOSVE-NEXT: fcvt h3, s20 -; NONEON-NOSVE-NEXT: mov v18.h[3], v3.h[0] -; NONEON-NOSVE-NEXT: fsqrt s5, s5 -; NONEON-NOSVE-NEXT: fcvt h0, s5 -; NONEON-NOSVE-NEXT: mov v2.h[4], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s21, s21 -; NONEON-NOSVE-NEXT: fcvt h3, s21 -; NONEON-NOSVE-NEXT: mov v18.h[4], v3.h[0] -; NONEON-NOSVE-NEXT: fsqrt s6, s6 -; NONEON-NOSVE-NEXT: fcvt h0, s6 -; NONEON-NOSVE-NEXT: mov v2.h[5], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s22, s22 -; NONEON-NOSVE-NEXT: fcvt h3, s22 -; NONEON-NOSVE-NEXT: mov v18.h[5], v3.h[0] -; NONEON-NOSVE-NEXT: fsqrt s7, s7 -; NONEON-NOSVE-NEXT: fcvt h0, s7 -; NONEON-NOSVE-NEXT: mov v2.h[6], v0.h[0] -; NONEON-NOSVE-NEXT: fsqrt s23, s23 -; NONEON-NOSVE-NEXT: fcvt h3, s23 -; NONEON-NOSVE-NEXT: mov v18.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: fsqrt s16, s16 -; NONEON-NOSVE-NEXT: fcvt h3, s16 -; NONEON-NOSVE-NEXT: mov v18.h[7], v3.h[0] -; NONEON-NOSVE-NEXT: fsqrt s1, s1 -; NONEON-NOSVE-NEXT: fcvt h0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q18, q2, [x0] +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op) @@ -1567,7 +2815,15 @@ define <2 x float> @fsqrt_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsqrt v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op) ret <2 x float> %res @@ -1584,7 +2840,20 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsqrt v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op) ret <4 x float> %res @@ -1602,10 +2871,32 @@ define void @fsqrt_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: fsqrt_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fsqrt v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fsqrt v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fsqrt s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fsqrt s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op) @@ -1624,7 +2915,15 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: fsqrt_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsqrt v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fsqrt d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fsqrt d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op) ret <2 x double> %res @@ -1642,10 +2941,22 @@ define void @fsqrt_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: fsqrt_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fsqrt v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fsqrt v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fsqrt d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fsqrt d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fsqrt d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fsqrt d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op) @@ -1669,10 +2980,39 @@ define <2 x half> @fsub_v2f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fsub <2 x half> %op1, %op2 ret <2 x half> %res @@ -1690,10 +3030,39 @@ define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fsub <4 x half> %op1, %op2 ret <4 x half> %res @@ -1711,14 +3080,66 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fsub v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fsub v1.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fsub <8 x half> %op1, %op2 ret <8 x half> %res @@ -1738,25 +3159,127 @@ define void @fsub_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fsub_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fsub v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fsub v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fsub v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fsub v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1777,7 +3300,17 @@ define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fsub <2 x float> %op1, %op2 ret <2 x float> %res @@ -1795,7 +3328,22 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fsub <4 x float> %op1, %op2 ret <4 x float> %res @@ -1815,11 +3363,39 @@ define void @fsub_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fsub_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fsub v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fsub v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fsub s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fsub s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -1840,7 +3416,16 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fsub_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fsub v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fsub d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fsub d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = fsub <2 x double> %op1, %op2 ret <2 x double> %res @@ -1860,11 +3445,27 @@ define void @fsub_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fsub_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fsub v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fsub v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fsub d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fsub d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fsub d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fsub d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -1888,7 +3489,30 @@ define <2 x half> @fabs_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: bic v0.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.fabs.v2f16(<2 x half> %op) ret <2 x half> %res @@ -1905,7 +3529,30 @@ define <4 x half> @fabs_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: bic v0.4h, #128, lsl #8 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op) ret <4 x half> %res @@ -1922,7 +3569,50 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: bic v0.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op) ret <8 x half> %res @@ -1940,10 +3630,92 @@ define void @fabs_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fabs_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: bic v0.8h, #128, lsl #8 -; NONEON-NOSVE-NEXT: bic v1.8h, #128, lsl #8 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: and w8, w8, #0x7fff +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op) @@ -1962,7 +3734,15 @@ define <2 x float> @fabs_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fabs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op) ret <2 x float> %res @@ -1979,7 +3759,20 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fabs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op) ret <4 x float> %res @@ -1997,10 +3790,32 @@ define void @fabs_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: fabs_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fabs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fabs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fabs s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fabs s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op) @@ -2019,7 +3834,15 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: fabs_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fabs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op) ret <2 x double> %res @@ -2037,10 +3860,22 @@ define void @fabs_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: fabs_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fabs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fabs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fabs d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fabs d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fabs d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll index c5ed70c8a5f2f8..776b6918923ae9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll @@ -23,10 +23,24 @@ define <2 x i16> @fcmp_oeq_v2f16(<2 x half> %op1, <2 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x half> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i16> @@ -46,10 +60,39 @@ define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <4 x half> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i16> @@ -69,61 +112,66 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcmp s3, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: mov h4, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov h6, v0.h[4] -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: fcmp s2, s5 -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h5, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[5] -; NONEON-NOSVE-NEXT: mov h4, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[6] -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: fcvt s3, h5 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <8 x half> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i16> @@ -145,119 +193,127 @@ define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, eq -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, eq -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, eq -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, eq -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, eq -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, eq -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -280,7 +336,18 @@ define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcmeq v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #8] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x float> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i32> @@ -300,7 +367,24 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #8] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <4 x float> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i32> @@ -322,11 +406,43 @@ define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fcmeq v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #56] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #40] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #48] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -347,7 +463,13 @@ define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v1f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcmeq d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fcmp d0, d1 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <1 x double> %op1, %op2 %sext = sext <1 x i1> %cmp to <1 x i64> @@ -367,7 +489,17 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcmeq v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp] +; NONEON-NOSVE-NEXT: fcmp d3, d2 +; NONEON-NOSVE-NEXT: csetm x9, eq +; NONEON-NOSVE-NEXT: fcmp d1, d0 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = fcmp oeq <2 x double> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i64> @@ -389,11 +521,29 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_oeq_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcmeq v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fcmeq v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #32] +; NONEON-NOSVE-NEXT: fcmp d3, d2 +; NONEON-NOSVE-NEXT: csetm x9, eq +; NONEON-NOSVE-NEXT: fcmp d1, d0 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp] +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp d3, d2 +; NONEON-NOSVE-NEXT: csetm x9, eq +; NONEON-NOSVE-NEXT: fcmp d1, d0 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -426,135 +576,143 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ueq_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h2 -; NONEON-NOSVE-NEXT: mov h5, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: mov h7, v1.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s6, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[4] -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: csinv w12, w9, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s7, s5 -; NONEON-NOSVE-NEXT: mov h5, v2.h[5] -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: csinv w10, w9, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x1] -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: csinv w11, w9, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s6, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s6, h16 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: csinv w9, w9, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s7, s5 -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: mov h7, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w13, eq -; NONEON-NOSVE-NEXT: csinv w13, w13, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s6, s3 -; NONEON-NOSVE-NEXT: fcvt s3, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: mov h6, v0.h[2] -; NONEON-NOSVE-NEXT: mov h7, v1.h[2] -; NONEON-NOSVE-NEXT: csetm w14, eq -; NONEON-NOSVE-NEXT: csinv w14, w14, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s4, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h0 -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w15, eq -; NONEON-NOSVE-NEXT: csinv w15, w15, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s5, s3 -; NONEON-NOSVE-NEXT: mov h3, v0.h[3] -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: csetm w16, eq -; NONEON-NOSVE-NEXT: csinv w16, w16, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s4, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h3 -; NONEON-NOSVE-NEXT: fmov s2, w12 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w17, eq -; NONEON-NOSVE-NEXT: csinv w17, w17, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[4] -; NONEON-NOSVE-NEXT: fmov s3, w17 -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v3.h[1], w16 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v0.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov v2.h[2], w10 -; NONEON-NOSVE-NEXT: mov v3.h[2], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: mov h7, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w11 -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov v3.h[3], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: mov v2.h[4], w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v3.h[4], w8 ; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov v2.h[5], w13 -; NONEON-NOSVE-NEXT: mov v3.h[5], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] ; NONEON-NOSVE-NEXT: fcmp s1, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], w14 -; NONEON-NOSVE-NEXT: mov v3.h[6], w8 ; NONEON-NOSVE-NEXT: csetm w8, eq ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, vc -; NONEON-NOSVE-NEXT: mov v2.h[7], w15 -; NONEON-NOSVE-NEXT: mov v3.h[7], w8 -; NONEON-NOSVE-NEXT: stp q3, q2, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -587,150 +745,158 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_one_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h2 -; NONEON-NOSVE-NEXT: mov h5, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: mov h7, v1.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: fcmp s6, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[4] -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w9, mi -; NONEON-NOSVE-NEXT: csinv w12, w9, wzr, le -; NONEON-NOSVE-NEXT: fcmp s7, s5 -; NONEON-NOSVE-NEXT: mov h5, v2.h[5] -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w9, mi -; NONEON-NOSVE-NEXT: csinv w10, w9, wzr, le -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x1] -; NONEON-NOSVE-NEXT: csetm w9, mi -; NONEON-NOSVE-NEXT: csinv w11, w9, wzr, le -; NONEON-NOSVE-NEXT: fcmp s6, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s6, h16 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w9, mi -; NONEON-NOSVE-NEXT: csinv w9, w9, wzr, le -; NONEON-NOSVE-NEXT: fcmp s7, s5 -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: mov h7, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w13, mi -; NONEON-NOSVE-NEXT: csinv w13, w13, wzr, le -; NONEON-NOSVE-NEXT: fcmp s6, s3 -; NONEON-NOSVE-NEXT: fcvt s3, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: mov h6, v0.h[2] -; NONEON-NOSVE-NEXT: mov h7, v1.h[2] -; NONEON-NOSVE-NEXT: csetm w14, mi -; NONEON-NOSVE-NEXT: csinv w14, w14, wzr, le -; NONEON-NOSVE-NEXT: fcmp s4, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h0 -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w15, mi -; NONEON-NOSVE-NEXT: csinv w15, w15, wzr, le -; NONEON-NOSVE-NEXT: fcmp s5, s3 -; NONEON-NOSVE-NEXT: mov h3, v0.h[3] -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: csetm w16, mi -; NONEON-NOSVE-NEXT: csinv w16, w16, wzr, le -; NONEON-NOSVE-NEXT: fcmp s4, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h3 -; NONEON-NOSVE-NEXT: fmov s2, w12 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w17, mi -; NONEON-NOSVE-NEXT: csinv w17, w17, wzr, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[4] -; NONEON-NOSVE-NEXT: fmov s3, w17 -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: mov v3.h[1], w16 ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v0.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov v2.h[2], w10 -; NONEON-NOSVE-NEXT: mov v3.h[2], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: mov h7, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w11 -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov v3.h[3], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: mov v2.h[4], w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v3.h[4], w8 ; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov v2.h[5], w13 -; NONEON-NOSVE-NEXT: mov v3.h[5], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] ; NONEON-NOSVE-NEXT: fcmp s1, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], w14 -; NONEON-NOSVE-NEXT: mov v3.h[6], w8 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi ; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le -; NONEON-NOSVE-NEXT: mov v2.h[7], w15 -; NONEON-NOSVE-NEXT: mov v3.h[7], w8 -; NONEON-NOSVE-NEXT: stp q3, q2, [x2] -; NONEON-NOSVE-NEXT: ret - %op1 = load <16 x half>, ptr %a - %op2 = load <16 x half>, ptr %b - %cmp = fcmp one <16 x half> %op1, %op2 - %sext = sext <16 x i1> %cmp to <16 x i16> - store <16 x i16> %sext, ptr %c - ret void -} - -; -; FCMP UNE -; - -define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: fcmp_une_v16f16: +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret + %op1 = load <16 x half>, ptr %a + %op2 = load <16 x half>, ptr %b + %cmp = fcmp one <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, ptr %c + ret void +} + +; +; FCMP UNE +; + +define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { +; CHECK-LABEL: fcmp_une_v16f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q3, [x1] ; CHECK-NEXT: ptrue p0.h, vl8 @@ -744,119 +910,127 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_une_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, ne -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, ne -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, ne -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, ne -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, ne -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, ne -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, ne -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, ne -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, ne -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -885,119 +1059,127 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ogt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, gt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, gt -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, gt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, gt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, gt -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, gt -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, gt -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, gt -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, gt -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1029,119 +1211,127 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ugt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, hi -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, hi -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, hi -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, hi -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, hi -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, hi -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, hi -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, hi -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, hi -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, hi -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1170,123 +1360,131 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_olt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, mi -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, mi -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, mi -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, mi -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, mi -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, mi -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, mi -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, mi -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, mi -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, mi -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] -; NONEON-NOSVE-NEXT: ret - %op1 = load <16 x half>, ptr %a - %op2 = load <16 x half>, ptr %b - %cmp = fcmp olt <16 x half> %op1, %op2 +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret + %op1 = load <16 x half>, ptr %a + %op2 = load <16 x half>, ptr %b + %cmp = fcmp olt <16 x half> %op1, %op2 %sext = sext <16 x i1> %cmp to <16 x i16> store <16 x i16> %sext, ptr %c ret void @@ -1314,119 +1512,127 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ult_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, lt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, lt -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, lt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, lt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, lt -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, lt -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, lt -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, lt -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, lt -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1455,119 +1661,127 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_oge_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, ge -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, ge -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, ge -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, ge -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, ge -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, ge -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, ge -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, ge -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, ge -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1599,119 +1813,127 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_uge_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, pl -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, pl -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, pl -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, pl -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, pl -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, pl -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, pl -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, pl -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, pl -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, pl -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, pl +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1740,263 +1962,279 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ole_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, ls -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, ls -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, ls -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, ls -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, ls -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, ls -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, ls -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, ls -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, ls -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, ls +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ret + %op1 = load <16 x half>, ptr %a + %op2 = load <16 x half>, ptr %b + %cmp = fcmp ole <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, ptr %c + ret void +} + +; +; FCMP ULE +; + +define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) { +; CHECK-LABEL: fcmp_ule_v16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h +; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h +; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: eor z1.d, z1.d, z0.d +; CHECK-NEXT: eor z0.d, z2.d, z0.d +; CHECK-NEXT: stp q1, q0, [x2] +; CHECK-NEXT: ret +; +; NONEON-NOSVE-LABEL: fcmp_ule_v16f16: +; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 -; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: csetm w8, ls -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] -; NONEON-NOSVE-NEXT: ret - %op1 = load <16 x half>, ptr %a - %op2 = load <16 x half>, ptr %b - %cmp = fcmp ole <16 x half> %op1, %op2 - %sext = sext <16 x i1> %cmp to <16 x i16> - store <16 x i16> %sext, ptr %c - ret void -} - -; -; FCMP ULE -; - -define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: fcmp_ule_v16f16: -; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q3, [x1] -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: ldp q1, q2, [x0] -; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h -; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h -; CHECK-NEXT: mov z0.h, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: eor z1.d, z1.d, z0.d -; CHECK-NEXT: eor z0.d, z2.d, z0.d -; CHECK-NEXT: stp q1, q0, [x2] -; CHECK-NEXT: ret -; -; NONEON-NOSVE-LABEL: fcmp_ule_v16f16: -; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, le -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, le -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, le -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, le -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, le -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, le -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2025,119 +2263,127 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_uno_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, vs -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, vs -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, vs -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, vs -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, vs -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, vs -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, vs -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, vs -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, vs -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vs +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, vs -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2169,119 +2415,127 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ord_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, vc -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, vc -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, vc -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, vc -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, vc -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, vc -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, vc -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, vc -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, vc -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, vc +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, vc -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2310,119 +2564,127 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_eq_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, eq -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, eq -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, eq -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, eq -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, eq -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, eq -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2451,119 +2713,127 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ne_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, ne -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, ne -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, ne -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, ne -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, ne -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, ne -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, ne -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, ne -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, ne -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2592,119 +2862,127 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_gt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, gt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, gt -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, gt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, gt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, gt -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, gt -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, gt -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, gt -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, gt -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, gt -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2733,119 +3011,127 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_lt_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, lt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, lt -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, lt -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, lt -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, lt -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, lt -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, lt -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, lt -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, lt -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, lt -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -2874,119 +3160,127 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_ge_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, ge -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, ge -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, ge -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, ge -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, ge -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, ge -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, ge -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, ge -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, ge -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, ge -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -3015,119 +3309,127 @@ define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fcmp_le_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] -; NONEON-NOSVE-NEXT: mov h0, v2.h[1] -; NONEON-NOSVE-NEXT: mov h3, v1.h[1] -; NONEON-NOSVE-NEXT: mov h4, v2.h[2] -; NONEON-NOSVE-NEXT: mov h5, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h2 -; NONEON-NOSVE-NEXT: fcvt s7, h1 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h0, v2.h[3] -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[4] -; NONEON-NOSVE-NEXT: mov h7, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w12, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v2.h[5] -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w11, le -; NONEON-NOSVE-NEXT: fcmp s3, s0 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w9, le -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: csetm w10, le -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[1] -; NONEON-NOSVE-NEXT: mov h5, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: csetm w13, le -; NONEON-NOSVE-NEXT: fcmp s7, s3 -; NONEON-NOSVE-NEXT: fmov s7, w12 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: csetm w14, le -; NONEON-NOSVE-NEXT: fcmp s6, s2 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: mov v7.h[1], w8 -; NONEON-NOSVE-NEXT: csetm w15, le -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h4, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: mov v7.h[2], w11 -; NONEON-NOSVE-NEXT: csetm w16, le -; NONEON-NOSVE-NEXT: fcmp s5, s2 -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: csetm w17, le -; NONEON-NOSVE-NEXT: mov v7.h[3], w9 -; NONEON-NOSVE-NEXT: fmov s2, w17 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: mov h4, v0.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[1], w16 -; NONEON-NOSVE-NEXT: mov v7.h[4], w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: mov h5, v1.h[5] -; NONEON-NOSVE-NEXT: mov h6, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: mov v7.h[5], w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s4, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: mov h4, v0.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v7.h[6], w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s6, s5 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v7.h[7], w15 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: fcmp s4, s3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: fcmp s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcmp s1, s0 ; NONEON-NOSVE-NEXT: csetm w8, le -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: stp q2, q7, [x2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x2] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll index 055af194be211a..2c08977320e848 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll @@ -21,13 +21,28 @@ define void @fp_convert_combine_crash(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fp_convert_combine_crash: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov v0.4s, #8.00000000 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmul v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmul v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0, #3 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1, #3 +; NONEON-NOSVE-NEXT: fcvtzs w10, s2, #3 +; NONEON-NOSVE-NEXT: fcvtzs w11, s0, #3 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s0, s3, [sp] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzs w12, s1, #3 +; NONEON-NOSVE-NEXT: fcvtzs w8, s2, #3 +; NONEON-NOSVE-NEXT: stp w11, w10, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w9, s3, #3 +; NONEON-NOSVE-NEXT: fcvtzs w10, s0, #3 +; NONEON-NOSVE-NEXT: stp w8, w12, [sp, #40] +; NONEON-NOSVE-NEXT: stp w10, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %f = load <8 x float>, ptr %a %mul.i = fmul <8 x float> %f, %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f16_to_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fpext <2 x half> %a to <2 x float> store <2 x float> %res, ptr %b @@ -41,8 +49,22 @@ define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v4f16_to_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fpext <4 x half> %a to <4 x float> store <4 x float> %res, ptr %b @@ -64,13 +86,33 @@ define void @fcvt_v8f16_to_v8f32(<8 x half> %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v8f16_to_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %res = fpext <8 x half> %a to <8 x float> store <8 x float> %res, ptr %b @@ -99,17 +141,57 @@ define void @fcvt_v16f16_to_v16f32(<16 x half> %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v16f16_to_v16f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: stp q0, q3, [x0] -; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %res = fpext <16 x half> %a to <16 x float> store <16 x float> %res, ptr %b @@ -132,9 +214,20 @@ define void @fcvt_v2f16_v2f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x half>, ptr %a %res = fpext <2 x half> %op1 to <2 x float> @@ -153,9 +246,23 @@ define void @fcvt_v4f16_v4f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fpext <4 x half> %op1 to <4 x float> @@ -178,13 +285,33 @@ define void @fcvt_v8f16_v8f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fpext <8 x half> %op1 to <8 x float> @@ -214,17 +341,57 @@ define void @fcvt_v16f16_v16f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fpext <16 x half> %op1 to <16 x float> @@ -246,9 +413,14 @@ define void @fcvt_v1f16_v1f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v1f16_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: ldr h0, [x0] ; NONEON-NOSVE-NEXT: fcvt d0, h0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <1 x half>, ptr %a %res = fpext <1 x half> %op1 to <1 x double> @@ -267,10 +439,26 @@ define void @fcvt_v2f16_v2f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f16_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x half>, ptr %a %res = fpext <2 x half> %op1 to <2 x double> @@ -292,15 +480,35 @@ define void @fcvt_v4f16_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v4f16_v4f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fpext <4 x half> %op1 to <4 x double> @@ -329,22 +537,61 @@ define void @fcvt_v8f16_v8f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v8f16_v8f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s -; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s -; NONEON-NOSVE-NEXT: stp q0, q2, [x1] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #92] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #88] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #144] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #84] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #80] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #128] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #72] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #68] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fpext <8 x half> %op1 to <8 x double> @@ -390,34 +637,115 @@ define void @fcvt_v16f16_v16f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v16f16_v16f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #336 +; NONEON-NOSVE-NEXT: str x29, [sp, #320] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 336 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] -; NONEON-NOSVE-NEXT: fcvtl v5.2d, v5.2s -; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s -; NONEON-NOSVE-NEXT: fcvtl v4.2d, v4.2s -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v7.2s -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v6.2s -; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #320] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #66] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #128] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #70] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #68] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s1, h0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #152] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #136] +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #120] +; NONEON-NOSVE-NEXT: str d1, [sp, #328] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #104] +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #164] +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #160] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #156] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #152] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #148] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #144] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #140] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #136] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #332] +; NONEON-NOSVE-NEXT: ldp q4, q3, [sp, #192] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #328] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #304] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #188] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #184] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #288] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #180] +; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #288] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #176] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #272] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #172] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #168] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #256] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224] +; NONEON-NOSVE-NEXT: ldp q2, q5, [sp, #256] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #336 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fpext <16 x half> %op1 to <16 x double> @@ -440,7 +768,7 @@ define void @fcvt_v1f32_v1f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v1f32_v1f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: fcvt d0, s0 ; NONEON-NOSVE-NEXT: str d0, [x1] ; NONEON-NOSVE-NEXT: ret %op1 = load <1 x float>, ptr %a @@ -460,9 +788,18 @@ define void @fcvt_v2f32_v2f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %res = fpext <2 x float> %op1 to <2 x double> @@ -485,13 +822,23 @@ define void @fcvt_v4f32_v4f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fpext <4 x float> %op1 to <4 x double> @@ -521,17 +868,37 @@ define void @fcvt_v8f32_v8f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s -; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt d1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt d0, s0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fpext <8 x float> %op1 to <8 x double> @@ -554,9 +921,21 @@ define void @fcvt_v2f32_v2f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f32_v2f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %res = fptrunc <2 x float> %op1 to <2 x half> @@ -576,8 +955,23 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v4f32_v4f16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptrunc <4 x float> %op1 to <4 x half> @@ -599,10 +993,36 @@ define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v8f32_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptrunc <8 x float> %op1 to <8 x half> @@ -647,11 +1067,19 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: mov d1, v0.d[1] +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt h0, d0 -; NONEON-NOSVE-NEXT: fcvt h1, d1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x double>, ptr %a %res = fptrunc <2 x double> %op1 to <2 x half> @@ -673,17 +1101,24 @@ define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: mov d1, v0.d[1] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt h0, d0 -; NONEON-NOSVE-NEXT: fcvt h1, d1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, d2 -; NONEON-NOSVE-NEXT: mov d2, v2.d[1] -; NONEON-NOSVE-NEXT: mov v0.h[2], v1.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, d2 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, d0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptrunc <4 x double> %op1 to <4 x half> @@ -706,8 +1141,7 @@ define void @fcvt_v1f64_v1f32(<1 x double> %op1, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v1f64_v1f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: fcvt s0, d0 ; NONEON-NOSVE-NEXT: str s0, [x0] ; NONEON-NOSVE-NEXT: ret %res = fptrunc <1 x double> %op1 to <1 x float> @@ -726,8 +1160,16 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v2f64_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, d0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptrunc <2 x double> %op1 to <2 x float> store <2 x float> %res, ptr %b @@ -748,10 +1190,22 @@ define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvt_v4f64_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, d0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, d0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptrunc <4 x double> %op1 to <4 x float> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll index 9d2b55903f3141..680cb4fb0a7910 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll @@ -21,14 +21,59 @@ define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) ; ; NONEON-NOSVE-LABEL: fma_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: str d2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h6, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h7, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: ldr h5, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: ldr h3, [sp] +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fmul s1, s3, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fmul s2, s7, s6 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s2, s0 +; NONEON-NOSVE-NEXT: fmul s2, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s2, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <4 x half> %op1, %op2 %res = fadd contract <4 x half> %mul, %op3 @@ -48,22 +93,107 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) ; ; NONEON-NOSVE-LABEL: fma_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fmul v3.4s, v4.4s, v3.4s -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v3.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: str q2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h22, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h23, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s3, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: ldr h20, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s22, h22 +; NONEON-NOSVE-NEXT: fcvt s23, h23 +; NONEON-NOSVE-NEXT: ldr h21, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s20, h20 +; NONEON-NOSVE-NEXT: ldr h18, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h19, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h16, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h17, [sp, #6] +; NONEON-NOSVE-NEXT: fmul s5, s1, s3 +; NONEON-NOSVE-NEXT: fcvt s21, h21 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: ldr h6, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h7, [sp, #4] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h4, [sp, #2] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fmul s3, s4, s3 +; NONEON-NOSVE-NEXT: fmul s0, s1, s0 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fmul s5, s23, s22 +; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: str h2, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fmul s5, s21, s20 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: str h2, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fmul s5, s19, s18 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: str h2, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fmul s5, s17, s16 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: str h2, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fmul s5, s7, s6 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h5, s5 +; NONEON-NOSVE-NEXT: str h2, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s5, s2 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: str h2, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s2, s3, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: str h1, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <8 x half> %op1, %op2 %res = fadd contract <8 x half> %mul, %op3 @@ -85,42 +215,228 @@ define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fmul v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fmul v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fmul v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fmul v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: ldp q0, q2, [x2] -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #208 +; NONEON-NOSVE-NEXT: stp d15, d14, [sp, #144] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp d13, d12, [sp, #160] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp d11, d10, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp d9, d8, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 208 +; NONEON-NOSVE-NEXT: .cfi_offset b8, -8 +; NONEON-NOSVE-NEXT: .cfi_offset b9, -16 +; NONEON-NOSVE-NEXT: .cfi_offset b10, -24 +; NONEON-NOSVE-NEXT: .cfi_offset b11, -32 +; NONEON-NOSVE-NEXT: .cfi_offset b12, -40 +; NONEON-NOSVE-NEXT: .cfi_offset b13, -48 +; NONEON-NOSVE-NEXT: .cfi_offset b14, -56 +; NONEON-NOSVE-NEXT: .cfi_offset b15, -64 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q18, q19, [x2] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h24, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h25, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #78] +; NONEON-NOSVE-NEXT: str q19, [sp, #96] +; NONEON-NOSVE-NEXT: str q18, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h18, [sp, #110] +; NONEON-NOSVE-NEXT: ldr h15, [sp, #92] +; NONEON-NOSVE-NEXT: fcvt s20, h0 +; NONEON-NOSVE-NEXT: fcvt s21, h1 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: ldr h13, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h14, [sp, #74] +; NONEON-NOSVE-NEXT: str h0, [sp, #14] // 2-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h12, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h9, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h10, [sp, #70] +; NONEON-NOSVE-NEXT: fmul s30, s21, s20 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: ldr h31, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h28, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h29, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h26, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h27, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h22, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h23, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h20, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h21, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt h19, s30 +; NONEON-NOSVE-NEXT: fcvt s30, h15 +; NONEON-NOSVE-NEXT: ldr h16, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h17, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h6, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h7, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h5, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s7, h7 +; NONEON-NOSVE-NEXT: fcvt s19, h19 +; NONEON-NOSVE-NEXT: fmul s0, s0, s30 +; NONEON-NOSVE-NEXT: fcvt s30, h14 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt s5, h5 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: fmul s16, s17, s16 +; NONEON-NOSVE-NEXT: fmul s6, s7, s6 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s18, s19, s18 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s19, h13 +; NONEON-NOSVE-NEXT: fmul s4, s5, s4 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: ldp d15, d14, [sp, #144] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fcvt h16, s16 +; NONEON-NOSVE-NEXT: fcvt h6, s6 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt h4, s4 +; NONEON-NOSVE-NEXT: fmul s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #14] // 2-byte Folded Reload +; NONEON-NOSVE-NEXT: fcvt s16, h16 +; NONEON-NOSVE-NEXT: fcvt s6, h6 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: str h18, [sp, #142] +; NONEON-NOSVE-NEXT: ldr h18, [sp, #108] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fmul s1, s1, s3 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s18 +; NONEON-NOSVE-NEXT: fmul s18, s30, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h11 +; NONEON-NOSVE-NEXT: fcvt s30, h12 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: ldp d13, d12, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: str h0, [sp, #140] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #106] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s30, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h9 +; NONEON-NOSVE-NEXT: fcvt s30, h10 +; NONEON-NOSVE-NEXT: ldp d11, d10, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #138] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #104] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s30, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h31 +; NONEON-NOSVE-NEXT: fcvt s30, h8 +; NONEON-NOSVE-NEXT: ldp d9, d8, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #136] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #102] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s30, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h28 +; NONEON-NOSVE-NEXT: fcvt s28, h29 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #134] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #100] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s28, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h26 +; NONEON-NOSVE-NEXT: fcvt s26, h27 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #132] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #98] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s26, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h24 +; NONEON-NOSVE-NEXT: fcvt s24, h25 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #130] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #96] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s24, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h22 +; NONEON-NOSVE-NEXT: fcvt s22, h23 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #128] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s22, s19 +; NONEON-NOSVE-NEXT: fcvt s19, h20 +; NONEON-NOSVE-NEXT: fcvt s20, h21 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #126] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fmul s18, s20, s19 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h18, s18 +; NONEON-NOSVE-NEXT: str h0, [sp, #124] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s18, h18 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s18, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #122] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s16, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s6, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #118] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s4, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #116] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s2, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #114] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #112] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #112] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #208 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -144,8 +460,19 @@ define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %o ; ; NONEON-NOSVE-LABEL: fma_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.2s, v1.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <2 x float> %op1, %op2 %res = fadd contract <2 x float> %mul, %op3 @@ -165,8 +492,26 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o ; ; NONEON-NOSVE-LABEL: fma_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <4 x float> %op1, %op2 %res = fadd contract <4 x float> %mul, %op3 @@ -188,12 +533,45 @@ define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fmla v1.4s, v0.4s, v2.4s -; NONEON-NOSVE-NEXT: fmla v5.4s, v4.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q2, [sp] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #92] +; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #56] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #88] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #48] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #120] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #84] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #80] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp] +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #104] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fmadd s5, s4, s3, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd s0, s2, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s5, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -212,7 +590,12 @@ define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double ; ; NONEON-NOSVE-LABEL: fma_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmadd d0, d0, d1, d2 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <1 x double> %op1, %op2 %res = fadd contract <1 x double> %mul, %op3 @@ -232,8 +615,19 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double ; ; NONEON-NOSVE-LABEL: fma_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmla v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %mul = fmul contract <2 x double> %op1, %op2 %res = fadd contract <2 x double> %mul, %op3 @@ -255,12 +649,31 @@ define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: fma_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q4, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q5, [x2] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fmla v1.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: fmla v5.2d, v4.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q1, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x2] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] +; NONEON-NOSVE-NEXT: stp q1, q5, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp, #48] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #80] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d2, d4, [sp] +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #112] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fmadd d5, d4, d3, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: fmadd d0, d2, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d5, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll index a96adfec2ad105..775cac272cde9d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll @@ -21,34 +21,39 @@ define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[2] -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: fcvt s7, h0 -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fmaxnm s5, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: fmaxnm s3, s4, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h0, s5 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h2, s3 -; NONEON-NOSVE-NEXT: fmaxnm s1, s4, s1 -; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res @@ -66,60 +71,66 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fmaxnm s3, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s4 -; NONEON-NOSVE-NEXT: fmaxnm s4, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: mov h7, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fmaxnm s5, s5, s16 -; NONEON-NOSVE-NEXT: mov h16, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt s3, h6 -; NONEON-NOSVE-NEXT: fcvt s6, h7 -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h5, s5 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fmaxnm s3, s6, s3 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fmaxnm s6, s16, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] -; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s6 -; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res @@ -139,115 +150,127 @@ define void @fmaxnm_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov h7, v0.h[1] -; NONEON-NOSVE-NEXT: mov h16, v0.h[2] -; NONEON-NOSVE-NEXT: mov h18, v2.h[1] -; NONEON-NOSVE-NEXT: mov h5, v1.h[1] -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h17, v3.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s19, h0 -; NONEON-NOSVE-NEXT: fcvt s20, h3 -; NONEON-NOSVE-NEXT: fcvt s21, h2 -; NONEON-NOSVE-NEXT: mov h22, v3.h[2] -; NONEON-NOSVE-NEXT: mov h23, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fmaxnm s4, s19, s4 -; NONEON-NOSVE-NEXT: mov h19, v0.h[3] -; NONEON-NOSVE-NEXT: mov h24, v3.h[3] -; NONEON-NOSVE-NEXT: fmaxnm s20, s21, s20 -; NONEON-NOSVE-NEXT: fcvt s21, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov h23, v2.h[3] -; NONEON-NOSVE-NEXT: mov h25, v2.h[6] -; NONEON-NOSVE-NEXT: fmaxnm s5, s7, s5 -; NONEON-NOSVE-NEXT: mov h7, v1.h[3] -; NONEON-NOSVE-NEXT: fmaxnm s6, s16, s6 -; NONEON-NOSVE-NEXT: fmaxnm s16, s18, s17 -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s18, h19 -; NONEON-NOSVE-NEXT: fcvt s19, h24 -; NONEON-NOSVE-NEXT: mov h24, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h17, s5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt h5, s20 -; NONEON-NOSVE-NEXT: fmaxnm s20, s22, s21 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt s21, h23 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: mov h22, v0.h[4] -; NONEON-NOSVE-NEXT: mov h23, v2.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] -; NONEON-NOSVE-NEXT: mov h17, v1.h[4] -; NONEON-NOSVE-NEXT: fmaxnm s7, s18, s7 -; NONEON-NOSVE-NEXT: mov h18, v3.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s20 -; NONEON-NOSVE-NEXT: fmaxnm s19, s21, s19 -; NONEON-NOSVE-NEXT: fcvt s20, h23 -; NONEON-NOSVE-NEXT: mov h21, v1.h[5] -; NONEON-NOSVE-NEXT: mov h23, v2.h[5] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] -; NONEON-NOSVE-NEXT: fcvt s6, h17 -; NONEON-NOSVE-NEXT: fcvt s17, h22 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: mov h22, v3.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s19 -; NONEON-NOSVE-NEXT: mov h19, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmaxnm s6, s17, s6 -; NONEON-NOSVE-NEXT: mov h17, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fmaxnm s18, s20, s18 -; NONEON-NOSVE-NEXT: mov h20, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt s7, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt s16, h21 -; NONEON-NOSVE-NEXT: fcvt s21, h24 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcvt s23, h25 -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: mov h3, v3.h[7] -; NONEON-NOSVE-NEXT: fmaxnm s7, s22, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmaxnm s16, s21, s16 -; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] -; NONEON-NOSVE-NEXT: fmaxnm s6, s19, s17 -; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] -; NONEON-NOSVE-NEXT: fmaxnm s17, s23, s20 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s17 -; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s2 -; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -268,7 +291,17 @@ define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnm v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res @@ -286,7 +319,22 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res @@ -306,11 +354,39 @@ define void @fmaxnm_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmaxnm v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmaxnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -327,7 +403,12 @@ define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmaxnm d0, d0, d1 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res @@ -345,7 +426,16 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmaxnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res @@ -365,11 +455,27 @@ define void @fmaxnm_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmaxnm_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmaxnm v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fmaxnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmaxnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -394,34 +500,39 @@ define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[2] -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: fcvt s7, h0 -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fminnm s5, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: fminnm s3, s4, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h0, s5 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h2, s3 -; NONEON-NOSVE-NEXT: fminnm s1, s4, s1 -; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res @@ -439,60 +550,66 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fminnm s3, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s4 -; NONEON-NOSVE-NEXT: fminnm s4, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: mov h7, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fminnm s5, s5, s16 -; NONEON-NOSVE-NEXT: mov h16, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt s3, h6 -; NONEON-NOSVE-NEXT: fcvt s6, h7 -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h5, s5 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fminnm s3, s6, s3 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fminnm s6, s16, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] -; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s6 -; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res @@ -512,115 +629,127 @@ define void @fminnm_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fminnm_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov h7, v0.h[1] -; NONEON-NOSVE-NEXT: mov h16, v0.h[2] -; NONEON-NOSVE-NEXT: mov h18, v2.h[1] -; NONEON-NOSVE-NEXT: mov h5, v1.h[1] -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h17, v3.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s19, h0 -; NONEON-NOSVE-NEXT: fcvt s20, h3 -; NONEON-NOSVE-NEXT: fcvt s21, h2 -; NONEON-NOSVE-NEXT: mov h22, v3.h[2] -; NONEON-NOSVE-NEXT: mov h23, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fminnm s4, s19, s4 -; NONEON-NOSVE-NEXT: mov h19, v0.h[3] -; NONEON-NOSVE-NEXT: mov h24, v3.h[3] -; NONEON-NOSVE-NEXT: fminnm s20, s21, s20 -; NONEON-NOSVE-NEXT: fcvt s21, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov h23, v2.h[3] -; NONEON-NOSVE-NEXT: mov h25, v2.h[6] -; NONEON-NOSVE-NEXT: fminnm s5, s7, s5 -; NONEON-NOSVE-NEXT: mov h7, v1.h[3] -; NONEON-NOSVE-NEXT: fminnm s6, s16, s6 -; NONEON-NOSVE-NEXT: fminnm s16, s18, s17 -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s18, h19 -; NONEON-NOSVE-NEXT: fcvt s19, h24 -; NONEON-NOSVE-NEXT: mov h24, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h17, s5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt h5, s20 -; NONEON-NOSVE-NEXT: fminnm s20, s22, s21 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt s21, h23 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: mov h22, v0.h[4] -; NONEON-NOSVE-NEXT: mov h23, v2.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] -; NONEON-NOSVE-NEXT: mov h17, v1.h[4] -; NONEON-NOSVE-NEXT: fminnm s7, s18, s7 -; NONEON-NOSVE-NEXT: mov h18, v3.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s20 -; NONEON-NOSVE-NEXT: fminnm s19, s21, s19 -; NONEON-NOSVE-NEXT: fcvt s20, h23 -; NONEON-NOSVE-NEXT: mov h21, v1.h[5] -; NONEON-NOSVE-NEXT: mov h23, v2.h[5] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] -; NONEON-NOSVE-NEXT: fcvt s6, h17 -; NONEON-NOSVE-NEXT: fcvt s17, h22 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: mov h22, v3.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s19 -; NONEON-NOSVE-NEXT: mov h19, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fminnm s6, s17, s6 -; NONEON-NOSVE-NEXT: mov h17, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fminnm s18, s20, s18 -; NONEON-NOSVE-NEXT: mov h20, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt s7, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt s16, h21 -; NONEON-NOSVE-NEXT: fcvt s21, h24 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcvt s23, h25 -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: mov h3, v3.h[7] -; NONEON-NOSVE-NEXT: fminnm s7, s22, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fminnm s16, s21, s16 -; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] -; NONEON-NOSVE-NEXT: fminnm s6, s19, s17 -; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] -; NONEON-NOSVE-NEXT: fminnm s17, s23, s20 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s17 -; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s2 -; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -641,7 +770,17 @@ define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnm v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res @@ -659,7 +798,22 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnm v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res @@ -679,11 +833,39 @@ define void @fminnm_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fminnm_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fminnm v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fminnm v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fminnm s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -700,7 +882,12 @@ define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fminnm d0, d0, d1 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res @@ -718,7 +905,16 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fminnm_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnm v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fminnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fminnm d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res @@ -738,11 +934,27 @@ define void @fminnm_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fminnm_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fminnm v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fminnm v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fminnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fminnm d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fminnm d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fminnm d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -767,34 +979,39 @@ define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[2] -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: fcvt s7, h0 -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s2, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fmax s5, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: fmax s3, s4, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h0, s5 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h2, s3 -; NONEON-NOSVE-NEXT: fmax s1, s4, s1 -; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res @@ -812,60 +1029,66 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmax s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fmax s3, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s4 -; NONEON-NOSVE-NEXT: fmax s4, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: mov h7, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fmax s5, s5, s16 -; NONEON-NOSVE-NEXT: mov h16, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt s3, h6 -; NONEON-NOSVE-NEXT: fcvt s6, h7 -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h5, s5 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fmax s3, s6, s3 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fmax s6, s16, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] -; NONEON-NOSVE-NEXT: fmax s4, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s6 -; NONEON-NOSVE-NEXT: fmax s0, s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res @@ -885,115 +1108,127 @@ define void @fmax_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmax_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov h7, v0.h[1] -; NONEON-NOSVE-NEXT: mov h16, v0.h[2] -; NONEON-NOSVE-NEXT: mov h18, v2.h[1] -; NONEON-NOSVE-NEXT: mov h5, v1.h[1] -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h17, v3.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s19, h0 -; NONEON-NOSVE-NEXT: fcvt s20, h3 -; NONEON-NOSVE-NEXT: fcvt s21, h2 -; NONEON-NOSVE-NEXT: mov h22, v3.h[2] -; NONEON-NOSVE-NEXT: mov h23, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fmax s4, s19, s4 -; NONEON-NOSVE-NEXT: mov h19, v0.h[3] -; NONEON-NOSVE-NEXT: mov h24, v3.h[3] -; NONEON-NOSVE-NEXT: fmax s20, s21, s20 -; NONEON-NOSVE-NEXT: fcvt s21, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov h23, v2.h[3] -; NONEON-NOSVE-NEXT: mov h25, v2.h[6] -; NONEON-NOSVE-NEXT: fmax s5, s7, s5 -; NONEON-NOSVE-NEXT: mov h7, v1.h[3] -; NONEON-NOSVE-NEXT: fmax s6, s16, s6 -; NONEON-NOSVE-NEXT: fmax s16, s18, s17 -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s18, h19 -; NONEON-NOSVE-NEXT: fcvt s19, h24 -; NONEON-NOSVE-NEXT: mov h24, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h17, s5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt h5, s20 -; NONEON-NOSVE-NEXT: fmax s20, s22, s21 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt s21, h23 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: mov h22, v0.h[4] -; NONEON-NOSVE-NEXT: mov h23, v2.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] -; NONEON-NOSVE-NEXT: mov h17, v1.h[4] -; NONEON-NOSVE-NEXT: fmax s7, s18, s7 -; NONEON-NOSVE-NEXT: mov h18, v3.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s20 -; NONEON-NOSVE-NEXT: fmax s19, s21, s19 -; NONEON-NOSVE-NEXT: fcvt s20, h23 -; NONEON-NOSVE-NEXT: mov h21, v1.h[5] -; NONEON-NOSVE-NEXT: mov h23, v2.h[5] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] -; NONEON-NOSVE-NEXT: fcvt s6, h17 -; NONEON-NOSVE-NEXT: fcvt s17, h22 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: mov h22, v3.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s19 -; NONEON-NOSVE-NEXT: mov h19, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmax s6, s17, s6 -; NONEON-NOSVE-NEXT: mov h17, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fmax s18, s20, s18 -; NONEON-NOSVE-NEXT: mov h20, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt s7, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt s16, h21 -; NONEON-NOSVE-NEXT: fcvt s21, h24 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcvt s23, h25 -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: mov h3, v3.h[7] -; NONEON-NOSVE-NEXT: fmax s7, s22, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmax s16, s21, s16 -; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] -; NONEON-NOSVE-NEXT: fmax s6, s19, s17 -; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] -; NONEON-NOSVE-NEXT: fmax s17, s23, s20 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fmax s0, s0, s1 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fmax s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s17 -; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s2 -; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1014,7 +1249,17 @@ define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res @@ -1032,7 +1277,22 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res @@ -1052,11 +1312,39 @@ define void @fmax_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmax_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmax s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -1073,7 +1361,12 @@ define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmax d0, d0, d1 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res @@ -1091,7 +1384,16 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmax_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmax v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmax d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmax d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res @@ -1111,11 +1413,27 @@ define void @fmax_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmax_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmax v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmax v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fmax d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fmax d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmax d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmax d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -1140,34 +1458,39 @@ define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[2] -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: fcvt s7, h0 -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s2, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h4 -; NONEON-NOSVE-NEXT: fcvt s4, h5 -; NONEON-NOSVE-NEXT: fmin s5, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v0.h[3] -; NONEON-NOSVE-NEXT: fmin s3, s4, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h0, s5 -; NONEON-NOSVE-NEXT: fcvt s4, h6 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h2, s3 -; NONEON-NOSVE-NEXT: fmin s1, s4, s1 -; NONEON-NOSVE-NEXT: mov v0.h[2], v2.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: mov v0.h[3], v1.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2) ret <4 x half> %res @@ -1185,60 +1508,66 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmin s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fmin s3, s3, s2 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s4 -; NONEON-NOSVE-NEXT: fmin s4, s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: mov h7, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fmin s5, s5, s16 -; NONEON-NOSVE-NEXT: mov h16, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: mov v2.h[1], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt s3, h6 -; NONEON-NOSVE-NEXT: fcvt s6, h7 -; NONEON-NOSVE-NEXT: mov h7, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h5, s5 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov v2.h[2], v4.h[0] -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fmin s3, s6, s3 -; NONEON-NOSVE-NEXT: mov h6, v0.h[6] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[3], v5.h[0] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h6 -; NONEON-NOSVE-NEXT: fmin s6, s16, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov v2.h[4], v3.h[0] -; NONEON-NOSVE-NEXT: fmin s4, s5, s4 -; NONEON-NOSVE-NEXT: fcvt h3, s6 -; NONEON-NOSVE-NEXT: fmin s0, s0, s1 -; NONEON-NOSVE-NEXT: mov v2.h[5], v3.h[0] -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v2.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v2.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2) ret <8 x half> %res @@ -1258,115 +1587,127 @@ define void @fmin_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmin_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov h7, v0.h[1] -; NONEON-NOSVE-NEXT: mov h16, v0.h[2] -; NONEON-NOSVE-NEXT: mov h18, v2.h[1] -; NONEON-NOSVE-NEXT: mov h5, v1.h[1] -; NONEON-NOSVE-NEXT: mov h6, v1.h[2] -; NONEON-NOSVE-NEXT: mov h17, v3.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s19, h0 -; NONEON-NOSVE-NEXT: fcvt s20, h3 -; NONEON-NOSVE-NEXT: fcvt s21, h2 -; NONEON-NOSVE-NEXT: mov h22, v3.h[2] -; NONEON-NOSVE-NEXT: mov h23, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fmin s4, s19, s4 -; NONEON-NOSVE-NEXT: mov h19, v0.h[3] -; NONEON-NOSVE-NEXT: mov h24, v3.h[3] -; NONEON-NOSVE-NEXT: fmin s20, s21, s20 -; NONEON-NOSVE-NEXT: fcvt s21, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov h23, v2.h[3] -; NONEON-NOSVE-NEXT: mov h25, v2.h[6] -; NONEON-NOSVE-NEXT: fmin s5, s7, s5 -; NONEON-NOSVE-NEXT: mov h7, v1.h[3] -; NONEON-NOSVE-NEXT: fmin s6, s16, s6 -; NONEON-NOSVE-NEXT: fmin s16, s18, s17 -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s18, h19 -; NONEON-NOSVE-NEXT: fcvt s19, h24 -; NONEON-NOSVE-NEXT: mov h24, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h17, s5 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcvt h5, s20 -; NONEON-NOSVE-NEXT: fmin s20, s22, s21 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt s21, h23 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: mov h22, v0.h[4] -; NONEON-NOSVE-NEXT: mov h23, v2.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[1], v17.h[0] -; NONEON-NOSVE-NEXT: mov h17, v1.h[4] -; NONEON-NOSVE-NEXT: fmin s7, s18, s7 -; NONEON-NOSVE-NEXT: mov h18, v3.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[1], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s20 -; NONEON-NOSVE-NEXT: fmin s19, s21, s19 -; NONEON-NOSVE-NEXT: fcvt s20, h23 -; NONEON-NOSVE-NEXT: mov h21, v1.h[5] -; NONEON-NOSVE-NEXT: mov h23, v2.h[5] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[2], v6.h[0] -; NONEON-NOSVE-NEXT: fcvt s6, h17 -; NONEON-NOSVE-NEXT: fcvt s17, h22 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fcvt s18, h18 -; NONEON-NOSVE-NEXT: mov h22, v3.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[2], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h16, s19 -; NONEON-NOSVE-NEXT: mov h19, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmin s6, s17, s6 -; NONEON-NOSVE-NEXT: mov h17, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fmin s18, s20, s18 -; NONEON-NOSVE-NEXT: mov h20, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[3], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt s7, h22 -; NONEON-NOSVE-NEXT: fcvt s22, h23 -; NONEON-NOSVE-NEXT: mov v5.h[3], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt s16, h21 -; NONEON-NOSVE-NEXT: fcvt s21, h24 -; NONEON-NOSVE-NEXT: fcvt s19, h19 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcvt s23, h25 -; NONEON-NOSVE-NEXT: fcvt h18, s18 -; NONEON-NOSVE-NEXT: fcvt s20, h20 -; NONEON-NOSVE-NEXT: mov h3, v3.h[7] -; NONEON-NOSVE-NEXT: fmin s7, s22, s7 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fmin s16, s21, s16 -; NONEON-NOSVE-NEXT: mov v4.h[4], v6.h[0] -; NONEON-NOSVE-NEXT: fmin s6, s19, s17 -; NONEON-NOSVE-NEXT: mov v5.h[4], v18.h[0] -; NONEON-NOSVE-NEXT: fmin s17, s23, s20 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt h7, s7 -; NONEON-NOSVE-NEXT: fmin s0, s0, s1 -; NONEON-NOSVE-NEXT: fcvt h16, s16 -; NONEON-NOSVE-NEXT: fcvt h6, s6 -; NONEON-NOSVE-NEXT: fmin s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s17 -; NONEON-NOSVE-NEXT: mov v5.h[5], v7.h[0] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: mov v4.h[5], v16.h[0] -; NONEON-NOSVE-NEXT: fcvt h1, s2 -; NONEON-NOSVE-NEXT: mov v5.h[6], v3.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[6], v6.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[7], v1.h[0] -; NONEON-NOSVE-NEXT: mov v4.h[7], v0.h[0] -; NONEON-NOSVE-NEXT: stp q5, q4, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -1387,7 +1728,17 @@ define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2) ret <2 x float> %res @@ -1405,7 +1756,22 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2) ret <4 x float> %res @@ -1425,11 +1791,39 @@ define void @fmin_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmin_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fmin s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -1446,7 +1840,12 @@ define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmin d0, d0, d1 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2) ret <1 x double> %res @@ -1464,7 +1863,16 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: fmin_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmin v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmin d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmin d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2) ret <2 x double> %res @@ -1484,11 +1892,27 @@ define void @fmin_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fmin_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmin v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmin v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fmin d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fmin d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fmin d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fmin d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll index f1561011e21812..f081d4ac65b279 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce-fa64.ll @@ -30,26 +30,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll index a0a7dad835662e..4eaaee7ce5055d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll @@ -23,26 +23,30 @@ define half @fadda_v4f16(half %start, <4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res @@ -71,45 +75,49 @@ define half @fadda_v8f16(half %start, <8 x half> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v8f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h1, [sp] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a) ret half %res @@ -154,86 +162,93 @@ define half @fadda_v16f16(half %start, ptr %a) { ; ; NONEON-NOSVE-LABEL: fadda_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s2, h1 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: fcvt s2, h1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[3] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[4] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[5] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op) @@ -251,10 +266,13 @@ define float @fadda_v2f32(float %start, <2 x float> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov s2, v1.s[1] -; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] ; NONEON-NOSVE-NEXT: fadd s0, s0, s2 +; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a) ret float %res @@ -275,13 +293,15 @@ define float @fadda_v4f32(float %start, <4 x float> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 -; NONEON-NOSVE-NEXT: mov s3, v1.s[2] -; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] ; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: fadd s0, s0, s3 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a) ret float %res @@ -310,22 +330,25 @@ define float @fadda_v8f32(float %start, ptr %a) { ; ; NONEON-NOSVE-LABEL: fadda_v8f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: mov s2, v1.s[1] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 -; NONEON-NOSVE-NEXT: mov s3, v1.s[2] -; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] ; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: fadd s0, s0, s3 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 -; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: mov s2, v1.s[1] -; NONEON-NOSVE-NEXT: mov s3, v1.s[2] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s0, s2 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 -; NONEON-NOSVE-NEXT: mov s1, v1.s[3] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #24] ; NONEON-NOSVE-NEXT: fadd s0, s0, s2 -; NONEON-NOSVE-NEXT: fadd s0, s0, s3 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op) @@ -357,9 +380,11 @@ define double @fadda_v2f64(double %start, <2 x double> %a) { ; ; NONEON-NOSVE-LABEL: fadda_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov d2, v1.d[1] -; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp], #16 ; NONEON-NOSVE-NEXT: fadd d0, d0, d2 +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 ; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a) ret double %res @@ -380,13 +405,19 @@ define double @fadda_v4f64(double %start, ptr %a) { ; ; NONEON-NOSVE-LABEL: fadda_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] -; NONEON-NOSVE-NEXT: mov d2, v3.d[1] -; NONEON-NOSVE-NEXT: fadd d0, d0, d3 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp] ; NONEON-NOSVE-NEXT: fadd d0, d0, d2 -; NONEON-NOSVE-NEXT: mov d2, v1.d[1] ; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #16] ; NONEON-NOSVE-NEXT: fadd d0, d0, d2 +; NONEON-NOSVE-NEXT: fadd d0, d0, d1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op) @@ -408,26 +439,30 @@ define half @faddv_v4f16(half %start, <4 x half> %a) { ; ; NONEON-NOSVE-LABEL: faddv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: mov h1, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 ; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a) ret half %res @@ -444,45 +479,49 @@ define half @faddv_v8f16(half %start, <8 x half> %a) { ; ; NONEON-NOSVE-LABEL: faddv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[6] -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 ; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a) ret half %res @@ -500,54 +539,90 @@ define half @faddv_v16f16(half %start, ptr %a) { ; ; NONEON-NOSVE-LABEL: faddv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fadd v3.4s, v4.4s, v3.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s -; NONEON-NOSVE-NEXT: mov h1, v2.h[1] -; NONEON-NOSVE-NEXT: fcvt s3, h2 +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h4, [sp] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s3, s1 -; NONEON-NOSVE-NEXT: mov h3, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s1, s3 -; NONEON-NOSVE-NEXT: mov h3, v2.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fadd s2, s4, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h4, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s1, s3 -; NONEON-NOSVE-NEXT: mov h3, v2.h[4] +; NONEON-NOSVE-NEXT: fcvt s4, h4 ; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fadd s3, s4, s3 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s1, s3 -; NONEON-NOSVE-NEXT: mov h3, v2.h[5] +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s3, s4, s3 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s3, s4, s3 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s1, s3 -; NONEON-NOSVE-NEXT: mov h3, v2.h[6] -; NONEON-NOSVE-NEXT: mov h2, v2.h[7] +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s3, s4, s3 +; NONEON-NOSVE-NEXT: ldr h4, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fcvt h2, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fadd s1, s1, s3 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fadd s2, s4, s3 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h4, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 +; NONEON-NOSVE-NEXT: fadd s2, s4, s3 ; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h2, s2 ; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fadd s1, s1, s2 ; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op) @@ -565,8 +640,13 @@ define float @faddv_v2f32(float %start, <2 x float> %a) { ; ; NONEON-NOSVE-LABEL: faddv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a) ret float %res @@ -583,8 +663,13 @@ define float @faddv_v4f32(float %start, <4 x float> %a) { ; ; NONEON-NOSVE-LABEL: faddv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: faddp v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s4, s3, [sp], #16 +; NONEON-NOSVE-NEXT: fadd s3, s4, s3 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fadd s1, s3, s1 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 ; NONEON-NOSVE-NEXT: ret %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a) @@ -604,10 +689,21 @@ define float @faddv_v8f32(float %start, ptr %a) { ; NONEON-NOSVE-LABEL: faddv_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] -; NONEON-NOSVE-NEXT: fadd v1.4s, v2.4s, v1.4s -; NONEON-NOSVE-NEXT: faddp v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: faddp s1, v1.2s +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s4, s3, [sp] +; NONEON-NOSVE-NEXT: ldp s5, s6, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s7, s16, [sp, #8] +; NONEON-NOSVE-NEXT: fadd s1, s3, s1 +; NONEON-NOSVE-NEXT: fadd s2, s4, s2 +; NONEON-NOSVE-NEXT: fadd s3, s7, s5 +; NONEON-NOSVE-NEXT: fadd s4, s16, s6 +; NONEON-NOSVE-NEXT: fadd s1, s2, s1 +; NONEON-NOSVE-NEXT: fadd s2, s3, s4 +; NONEON-NOSVE-NEXT: fadd s1, s1, s2 ; NONEON-NOSVE-NEXT: fadd s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op) @@ -639,7 +735,10 @@ define double @faddv_v2f64(double %start, <2 x double> %a) { ; ; NONEON-NOSVE-LABEL: faddv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: faddp d1, v1.2d +; NONEON-NOSVE-NEXT: str q1, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp], #16 +; NONEON-NOSVE-NEXT: fadd d1, d2, d1 ; NONEON-NOSVE-NEXT: fadd d0, d0, d1 ; NONEON-NOSVE-NEXT: ret %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a) @@ -659,8 +758,13 @@ define double @faddv_v4f64(double %start, ptr %a) { ; NONEON-NOSVE-LABEL: faddv_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] -; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v1.2d -; NONEON-NOSVE-NEXT: faddp d1, v1.2d +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d4, d3, [sp], #32 +; NONEON-NOSVE-NEXT: fadd d1, d3, d1 +; NONEON-NOSVE-NEXT: fadd d2, d4, d2 +; NONEON-NOSVE-NEXT: fadd d1, d2, d1 ; NONEON-NOSVE-NEXT: fadd d0, d0, d1 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a @@ -683,22 +787,26 @@ define half @fmaxv_v4f16(<4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a) ret half %res @@ -715,41 +823,45 @@ define half @fmaxv_v8f16(<8 x half> %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a) ret half %res @@ -767,81 +879,86 @@ define half @fmaxv_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h3, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fmaxnm s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fmaxnm s2, s4, s2 -; NONEON-NOSVE-NEXT: mov h4, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[4] -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #22] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[5] -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmaxnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #26] ; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fmaxnm s3, s5, s4 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fmaxnm s1, s3, s2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op) @@ -859,7 +976,12 @@ define float @fmaxv_v2f32(<2 x float> %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnmp s0, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a) ret float %res @@ -876,7 +998,14 @@ define float @fmaxv_v4f32(<4 x float> %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnmv s0, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s2 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a) ret float %res @@ -895,8 +1024,20 @@ define float @fmaxv_v8f32(ptr %a) { ; NONEON-NOSVE-LABEL: fmaxv_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmaxnm v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmaxnmv s0, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s3, s2, [sp] +; NONEON-NOSVE-NEXT: fmaxnm s0, s2, s0 +; NONEON-NOSVE-NEXT: fmaxnm s1, s3, s1 +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: fmaxnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s3, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fmaxnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fmaxnm s1, s4, s1 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s2 +; NONEON-NOSVE-NEXT: fmaxnm s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op) @@ -926,7 +1067,10 @@ define double @fmaxv_v2f64(<2 x double> %a) { ; ; NONEON-NOSVE-LABEL: fmaxv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxnmp d0, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp], #16 +; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a) ret double %res @@ -945,8 +1089,13 @@ define double @fmaxv_v4f64(ptr %a) { ; NONEON-NOSVE-LABEL: fmaxv_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmaxnm v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmaxnmp d0, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d3, d2, [sp], #32 +; NONEON-NOSVE-NEXT: fmaxnm d0, d2, d0 +; NONEON-NOSVE-NEXT: fmaxnm d1, d3, d1 +; NONEON-NOSVE-NEXT: fmaxnm d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op) @@ -968,22 +1117,26 @@ define half @fminv_v4f16(<4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fminv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a) ret half %res @@ -1000,41 +1153,45 @@ define half @fminv_v8f16(<8 x half> %a) { ; ; NONEON-NOSVE-LABEL: fminv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a) ret half %res @@ -1052,81 +1209,86 @@ define half @fminv_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fminv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h3, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fminnm s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #22] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fminnm s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fminnm s2, s4, s2 -; NONEON-NOSVE-NEXT: mov h4, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[4] -; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #26] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fminnm s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[5] -; NONEON-NOSVE-NEXT: fminnm s2, s2, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fminnm s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fminnm s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: fminnm s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 -; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fminnm s3, s5, s4 +; NONEON-NOSVE-NEXT: fminnm s1, s3, s2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h1, s2 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op) @@ -1144,7 +1306,12 @@ define float @fminv_v2f32(<2 x float> %a) { ; ; NONEON-NOSVE-LABEL: fminv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnmp s0, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a) ret float %res @@ -1161,7 +1328,14 @@ define float @fminv_v4f32(<4 x float> %a) { ; ; NONEON-NOSVE-LABEL: fminv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnmv s0, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fminnm s0, s0, s2 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a) ret float %res @@ -1180,8 +1354,20 @@ define float @fminv_v8f32(ptr %a) { ; NONEON-NOSVE-LABEL: fminv_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fminnm v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fminnmv s0, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s3, s2, [sp] +; NONEON-NOSVE-NEXT: fminnm s0, s2, s0 +; NONEON-NOSVE-NEXT: fminnm s1, s3, s1 +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: fminnm s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s3, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fminnm s2, s2, s3 +; NONEON-NOSVE-NEXT: fminnm s1, s4, s1 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s2 +; NONEON-NOSVE-NEXT: fminnm s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op) @@ -1211,7 +1397,10 @@ define double @fminv_v2f64(<2 x double> %a) { ; ; NONEON-NOSVE-LABEL: fminv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminnmp d0, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp], #16 +; NONEON-NOSVE-NEXT: fminnm d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a) ret double %res @@ -1230,8 +1419,13 @@ define double @fminv_v4f64(ptr %a) { ; NONEON-NOSVE-LABEL: fminv_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fminnm v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fminnmp d0, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d3, d2, [sp], #32 +; NONEON-NOSVE-NEXT: fminnm d0, d2, d0 +; NONEON-NOSVE-NEXT: fminnm d1, d3, d1 +; NONEON-NOSVE-NEXT: fminnm d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op) @@ -1253,22 +1447,26 @@ define half @fmaximumv_v4f16(<4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a) ret half %res @@ -1285,41 +1483,45 @@ define half @fmaximumv_v8f16(<8 x half> %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a) ret half %res @@ -1337,81 +1539,86 @@ define half @fmaximumv_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h3, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fmax s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmax s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fmax s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #22] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmax s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fmax s2, s4, s2 -; NONEON-NOSVE-NEXT: mov h4, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmax s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[4] -; NONEON-NOSVE-NEXT: fmax s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #26] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmax s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[5] -; NONEON-NOSVE-NEXT: fmax s2, s2, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmax s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmax s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fmax s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: fmax s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fmax s0, s0, s1 -; NONEON-NOSVE-NEXT: fmax s2, s2, s3 -; NONEON-NOSVE-NEXT: fmax s3, s5, s4 +; NONEON-NOSVE-NEXT: fmax s1, s3, s2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmax s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h1, s2 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op) @@ -1429,7 +1636,12 @@ define float @fmaximumv_v2f32(<2 x float> %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxp s0, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a) ret float %res @@ -1446,7 +1658,14 @@ define float @fmaximumv_v4f32(<4 x float> %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxv s0, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fmax s0, s0, s2 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a) ret float %res @@ -1465,8 +1684,20 @@ define float @fmaximumv_v8f32(ptr %a) { ; NONEON-NOSVE-LABEL: fmaximumv_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fmaxv s0, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s3, s2, [sp] +; NONEON-NOSVE-NEXT: fmax s0, s2, s0 +; NONEON-NOSVE-NEXT: fmax s1, s3, s1 +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: fmax s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s3, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fmax s2, s2, s3 +; NONEON-NOSVE-NEXT: fmax s1, s4, s1 +; NONEON-NOSVE-NEXT: fmax s0, s0, s2 +; NONEON-NOSVE-NEXT: fmax s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op) @@ -1496,7 +1727,10 @@ define double @fmaximumv_v2f64(<2 x double> %a) { ; ; NONEON-NOSVE-LABEL: fmaximumv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmaxp d0, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp], #16 +; NONEON-NOSVE-NEXT: fmax d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a) ret double %res @@ -1515,8 +1749,13 @@ define double @fmaximumv_v4f64(ptr %a) { ; NONEON-NOSVE-LABEL: fmaximumv_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmax v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fmaxp d0, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d3, d2, [sp], #32 +; NONEON-NOSVE-NEXT: fmax d0, d2, d0 +; NONEON-NOSVE-NEXT: fmax d1, d3, d1 +; NONEON-NOSVE-NEXT: fmax d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op) @@ -1538,22 +1777,26 @@ define half @fminimumv_v4f16(<4 x half> %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: mov h0, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a) ret half %res @@ -1570,41 +1813,45 @@ define half @fminimumv_v8f16(<8 x half> %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s2, s1 -; NONEON-NOSVE-NEXT: mov h2, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: mov h2, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fcvt h1, s1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s1, s1, s2 -; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s0, s1, s0 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a) ret half %res @@ -1622,81 +1869,86 @@ define half @fminimumv_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s4, h1 -; NONEON-NOSVE-NEXT: fcvt s5, h0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: ldr h2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h3, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt s2, h2 +; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fmin s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmin s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fmin s2, s3, s2 -; NONEON-NOSVE-NEXT: mov h3, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #22] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmin s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[3] -; NONEON-NOSVE-NEXT: fmin s2, s4, s2 -; NONEON-NOSVE-NEXT: mov h4, v1.h[3] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmin s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[4] -; NONEON-NOSVE-NEXT: fmin s2, s2, s3 -; NONEON-NOSVE-NEXT: mov h3, v1.h[4] -; NONEON-NOSVE-NEXT: fcvt h4, s4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #26] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmin s3, s5, s3 -; NONEON-NOSVE-NEXT: mov h5, v0.h[5] -; NONEON-NOSVE-NEXT: fmin s2, s2, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[5] -; NONEON-NOSVE-NEXT: fcvt h3, s3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fmin s2, s3, s2 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s3, h3 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fcvt h1, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fmin s4, s5, s4 -; NONEON-NOSVE-NEXT: mov h5, v0.h[6] -; NONEON-NOSVE-NEXT: mov h0, v0.h[7] -; NONEON-NOSVE-NEXT: fmin s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h3, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[6] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: mov h1, v1.h[7] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: fmin s1, s3, s2 +; NONEON-NOSVE-NEXT: ldr h2, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h3, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: fcvt h1, s1 +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 ; NONEON-NOSVE-NEXT: fmin s0, s0, s1 -; NONEON-NOSVE-NEXT: fmin s2, s2, s3 -; NONEON-NOSVE-NEXT: fmin s3, s5, s4 +; NONEON-NOSVE-NEXT: fmin s1, s3, s2 ; NONEON-NOSVE-NEXT: fcvt h0, s0 -; NONEON-NOSVE-NEXT: fcvt h2, s2 -; NONEON-NOSVE-NEXT: fcvt h3, s3 +; NONEON-NOSVE-NEXT: fcvt h1, s1 ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fmin s2, s2, s3 -; NONEON-NOSVE-NEXT: fcvt h1, s2 ; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 ; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op) @@ -1714,7 +1966,12 @@ define float @fminimumv_v2f32(<2 x float> %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminp s0, v0.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a) ret float %res @@ -1731,7 +1988,14 @@ define float @fminimumv_v4f32(<4 x float> %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminv s0, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s2, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fmin s0, s0, s2 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a) ret float %res @@ -1750,8 +2014,20 @@ define float @fminimumv_v8f32(ptr %a) { ; NONEON-NOSVE-LABEL: fminimumv_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fminv s0, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s3, s2, [sp] +; NONEON-NOSVE-NEXT: fmin s0, s2, s0 +; NONEON-NOSVE-NEXT: fmin s1, s3, s1 +; NONEON-NOSVE-NEXT: ldp s2, s4, [sp, #8] +; NONEON-NOSVE-NEXT: fmin s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s3, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fmin s2, s2, s3 +; NONEON-NOSVE-NEXT: fmin s1, s4, s1 +; NONEON-NOSVE-NEXT: fmin s0, s0, s2 +; NONEON-NOSVE-NEXT: fmin s0, s0, s1 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op) @@ -1781,7 +2057,10 @@ define double @fminimumv_v2f64(<2 x double> %a) { ; ; NONEON-NOSVE-LABEL: fminimumv_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fminp d0, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp], #16 +; NONEON-NOSVE-NEXT: fmin d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a) ret double %res @@ -1800,8 +2079,13 @@ define double @fminimumv_v4f64(ptr %a) { ; NONEON-NOSVE-LABEL: fminimumv_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: fmin v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fminp d0, v0.2d +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d3, d2, [sp], #32 +; NONEON-NOSVE-NEXT: fmin d0, d2, d0 +; NONEON-NOSVE-NEXT: fmin d1, d3, d1 +; NONEON-NOSVE-NEXT: fmin d0, d1, d0 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll index 6af2b885ace08f..344aac5b198384 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll @@ -20,9 +20,30 @@ define <2 x half> @frintp_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.ceil.v2f16(<2 x half> %op) ret <2 x half> %res @@ -39,9 +60,30 @@ define <4 x half> @frintp_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.ceil.v4f16(<4 x half> %op) ret <4 x half> %res @@ -58,12 +100,50 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frintp v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.ceil.v8f16(<8 x half> %op) ret <8 x half> %res @@ -81,20 +161,92 @@ define void @frintp_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintp_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frintp v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frintp v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.ceil.v16f16(<16 x half> %op) @@ -113,7 +265,15 @@ define <2 x float> @frintp_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintp v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.ceil.v2f32(<2 x float> %op) ret <2 x float> %res @@ -130,7 +290,20 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.ceil.v4f32(<4 x float> %op) ret <4 x float> %res @@ -148,10 +321,32 @@ define void @frintp_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintp_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintp v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintp v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintp s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintp s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.ceil.v8f32(<8 x float> %op) @@ -167,7 +362,12 @@ define <1 x double> @frintp_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frintp d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.ceil.v1f64(<1 x double> %op) ret <1 x double> %res @@ -184,7 +384,15 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintp_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintp v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintp d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintp d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.ceil.v2f64(<2 x double> %op) ret <2 x double> %res @@ -202,10 +410,22 @@ define void @frintp_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintp_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintp v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frintp v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frintp d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frintp d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintp d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintp d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.ceil.v4f64(<4 x double> %op) @@ -228,9 +448,30 @@ define <2 x half> @frintm_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.floor.v2f16(<2 x half> %op) ret <2 x half> %res @@ -247,9 +488,30 @@ define <4 x half> @frintm_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.floor.v4f16(<4 x half> %op) ret <4 x half> %res @@ -266,12 +528,50 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frintm v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.floor.v8f16(<8 x half> %op) ret <8 x half> %res @@ -289,20 +589,92 @@ define void @frintm_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintm_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frintm v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frintm v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.floor.v16f16(<16 x half> %op) @@ -321,7 +693,15 @@ define <2 x float> @frintm_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintm v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.floor.v2f32(<2 x float> %op) ret <2 x float> %res @@ -338,7 +718,20 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.floor.v4f32(<4 x float> %op) ret <4 x float> %res @@ -356,10 +749,32 @@ define void @frintm_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintm_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintm v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintm v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintm s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintm s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.floor.v8f32(<8 x float> %op) @@ -375,7 +790,12 @@ define <1 x double> @frintm_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frintm d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.floor.v1f64(<1 x double> %op) ret <1 x double> %res @@ -392,7 +812,15 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintm_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintm v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintm d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintm d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.floor.v2f64(<2 x double> %op) ret <2 x double> %res @@ -410,10 +838,22 @@ define void @frintm_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintm_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintm v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frintm v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frintm d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frintm d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintm d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintm d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.floor.v4f64(<4 x double> %op) @@ -436,9 +876,30 @@ define <2 x half> @frinti_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.nearbyint.v2f16(<2 x half> %op) ret <2 x half> %res @@ -455,9 +916,30 @@ define <4 x half> @frinti_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %op) ret <4 x half> %res @@ -474,12 +956,50 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frinti v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %op) ret <8 x half> %res @@ -497,20 +1017,92 @@ define void @frinti_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinti_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frinti v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frinti v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.nearbyint.v16f16(<16 x half> %op) @@ -529,7 +1121,15 @@ define <2 x float> @frinti_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinti v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %op) ret <2 x float> %res @@ -546,7 +1146,20 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %op) ret <4 x float> %res @@ -564,10 +1177,32 @@ define void @frinti_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinti_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frinti v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frinti v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinti s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinti s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> %op) @@ -583,7 +1218,12 @@ define <1 x double> @frinti_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frinti d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> %op) ret <1 x double> %res @@ -600,7 +1240,15 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frinti_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinti v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frinti d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frinti d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %op) ret <2 x double> %res @@ -618,10 +1266,22 @@ define void @frinti_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinti_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frinti v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frinti v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frinti d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frinti d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frinti d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frinti d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %op) @@ -644,9 +1304,30 @@ define <2 x half> @frintx_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.rint.v2f16(<2 x half> %op) ret <2 x half> %res @@ -663,9 +1344,30 @@ define <4 x half> @frintx_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.rint.v4f16(<4 x half> %op) ret <4 x half> %res @@ -682,12 +1384,50 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frintx v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.rint.v8f16(<8 x half> %op) ret <8 x half> %res @@ -705,20 +1445,92 @@ define void @frintx_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintx_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frintx v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frintx v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.rint.v16f16(<16 x half> %op) @@ -737,7 +1549,15 @@ define <2 x float> @frintx_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintx v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.rint.v2f32(<2 x float> %op) ret <2 x float> %res @@ -754,7 +1574,20 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.rint.v4f32(<4 x float> %op) ret <4 x float> %res @@ -772,10 +1605,32 @@ define void @frintx_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintx_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintx v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintx v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintx s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintx s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.rint.v8f32(<8 x float> %op) @@ -791,7 +1646,12 @@ define <1 x double> @frintx_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frintx d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.rint.v1f64(<1 x double> %op) ret <1 x double> %res @@ -808,7 +1668,15 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintx_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintx v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintx d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintx d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.rint.v2f64(<2 x double> %op) ret <2 x double> %res @@ -826,10 +1694,22 @@ define void @frintx_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintx_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintx v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frintx v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frintx d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frintx d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintx d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintx d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.rint.v4f64(<4 x double> %op) @@ -852,9 +1732,30 @@ define <2 x half> @frinta_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.round.v2f16(<2 x half> %op) ret <2 x half> %res @@ -871,9 +1772,30 @@ define <4 x half> @frinta_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.round.v4f16(<4 x half> %op) ret <4 x half> %res @@ -890,12 +1812,50 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frinta v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.round.v8f16(<8 x half> %op) ret <8 x half> %res @@ -913,20 +1873,92 @@ define void @frinta_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinta_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frinta v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frinta v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.round.v16f16(<16 x half> %op) @@ -945,7 +1977,15 @@ define <2 x float> @frinta_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinta v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.round.v2f32(<2 x float> %op) ret <2 x float> %res @@ -962,7 +2002,20 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.round.v4f32(<4 x float> %op) ret <4 x float> %res @@ -980,10 +2033,32 @@ define void @frinta_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinta_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frinta v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frinta v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frinta s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frinta s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.round.v8f32(<8 x float> %op) @@ -999,7 +2074,12 @@ define <1 x double> @frinta_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frinta d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.round.v1f64(<1 x double> %op) ret <1 x double> %res @@ -1016,7 +2096,15 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frinta_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frinta v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frinta d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frinta d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.round.v2f64(<2 x double> %op) ret <2 x double> %res @@ -1034,10 +2122,22 @@ define void @frinta_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frinta_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frinta v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frinta v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frinta d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frinta d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frinta d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frinta d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.round.v4f64(<4 x double> %op) @@ -1060,9 +2160,30 @@ define <2 x half> @frintn_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.roundeven.v2f16(<2 x half> %op) ret <2 x half> %res @@ -1079,9 +2200,30 @@ define <4 x half> @frintn_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.roundeven.v4f16(<4 x half> %op) ret <4 x half> %res @@ -1098,12 +2240,50 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frintn v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.roundeven.v8f16(<8 x half> %op) ret <8 x half> %res @@ -1121,20 +2301,92 @@ define void @frintn_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintn_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frintn v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frintn v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.roundeven.v16f16(<16 x half> %op) @@ -1153,7 +2405,15 @@ define <2 x float> @frintn_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintn v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.roundeven.v2f32(<2 x float> %op) ret <2 x float> %res @@ -1170,7 +2430,20 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %op) ret <4 x float> %res @@ -1188,10 +2461,32 @@ define void @frintn_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintn_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintn v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintn v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintn s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintn s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %op) @@ -1207,7 +2502,12 @@ define <1 x double> @frintn_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frintn d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.roundeven.v1f64(<1 x double> %op) ret <1 x double> %res @@ -1224,7 +2524,15 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintn_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintn v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintn d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintn d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %op) ret <2 x double> %res @@ -1242,10 +2550,22 @@ define void @frintn_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintn_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintn v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frintn v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frintn d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frintn d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintn d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintn d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %op) @@ -1268,9 +2588,30 @@ define <2 x half> @frintz_v2f16(<2 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x half> @llvm.trunc.v2f16(<2 x half> %op) ret <2 x half> %res @@ -1287,9 +2628,30 @@ define <4 x half> @frintz_v4f16(<4 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x half> @llvm.trunc.v4f16(<4 x half> %op) ret <4 x half> %res @@ -1306,12 +2668,50 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v0.8h -; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v1.4s -; NONEON-NOSVE-NEXT: frintz v1.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x half> @llvm.trunc.v8f16(<8 x half> %op) ret <8 x half> %res @@ -1329,20 +2729,92 @@ define void @frintz_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintz_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: frintz v2.4s, v2.4s -; NONEON-NOSVE-NEXT: frintz v3.4s, v3.4s -; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v3.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v1.4s -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x half>, ptr %a %res = call <16 x half> @llvm.trunc.v16f16(<16 x half> %op) @@ -1361,7 +2833,15 @@ define <2 x float> @frintz_v2f32(<2 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintz v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x float> @llvm.trunc.v2f32(<2 x float> %op) ret <2 x float> %res @@ -1378,7 +2858,20 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x float> @llvm.trunc.v4f32(<4 x float> %op) ret <4 x float> %res @@ -1396,10 +2889,32 @@ define void @frintz_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintz_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintz v0.4s, v0.4s -; NONEON-NOSVE-NEXT: frintz v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: frintz s1, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: frintz s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x float>, ptr %a %res = call <8 x float> @llvm.trunc.v8f32(<8 x float> %op) @@ -1415,7 +2930,12 @@ define <1 x double> @frintz_v1f64(<1 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x double> @llvm.trunc.v1f64(<1 x double> %op) ret <1 x double> %res @@ -1432,7 +2952,15 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) { ; ; NONEON-NOSVE-LABEL: frintz_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: frintz v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintz d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x double> @llvm.trunc.v2f64(<2 x double> %op) ret <2 x double> %res @@ -1450,10 +2978,22 @@ define void @frintz_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: frintz_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: frintz v0.2d, v0.2d -; NONEON-NOSVE-NEXT: frintz v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: frintz d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: frintz d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: frintz d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x double>, ptr %a %res = call <4 x double> @llvm.trunc.v4f64(<4 x double> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll index 824419b31a5a83..daa9b51cc827b3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll @@ -20,10 +20,28 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v2f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4h, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x half> %op1, <2 x half> %op2 ret <2 x half> %sel @@ -44,10 +62,28 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4h, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2 ret <4 x half> %sel @@ -68,10 +104,43 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v8f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.8h, w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2 ret <8 x half> %sel @@ -95,16 +164,83 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v16f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #62] +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #60] +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #58] +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #56] +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #54] +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #52] +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #50] +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #48] +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <16 x half>, ptr %a %op2 = load volatile <16 x half>, ptr %b @@ -128,10 +264,18 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v2f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.2s, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2 ret <2 x float> %sel @@ -152,10 +296,23 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4s, w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2 ret <4 x float> %sel @@ -179,16 +336,43 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v8f32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s3, s0, s2, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s3, s0, s2, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #56] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: fcsel s3, s0, s2, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #48] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #4] +; NONEON-NOSVE-NEXT: fcsel s3, s0, s2, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp] +; NONEON-NOSVE-NEXT: fcsel s0, s0, s1, ne +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b @@ -206,10 +390,13 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask ; ; NONEON-NOSVE-LABEL: select_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: fmov d2, x8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fcsel d0, d0, d1, ne +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2 ret <1 x double> %sel @@ -231,10 +418,17 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask ; ; NONEON-NOSVE-LABEL: select_v2f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: dup v2.2d, x8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel d3, d2, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2 ret <2 x double> %sel @@ -259,16 +453,31 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel d3, d0, d2, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel d0, d0, d1, ne +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #48] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: fcsel d3, d0, d2, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: fcsel d0, d0, d1, ne +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index c853bdc5af8db0..0d92a6fa0fa28d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -19,9 +19,26 @@ define <4 x i16> @fcvtzu_v4f16_v4i16(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x half> %op1 to <4 x i16> ret <4 x i16> %res @@ -39,16 +56,43 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i16> @@ -69,22 +113,75 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i16> @@ -108,9 +205,17 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x half> %op1 to <2 x i32> ret <2 x i32> %res @@ -128,8 +233,25 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x half> %op1 to <4 x i32> ret <4 x i32> %res @@ -151,15 +273,41 @@ define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i32> @@ -189,21 +337,73 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: fcvtzu w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i32> @@ -224,9 +424,13 @@ define <1 x i64> @fcvtzu_v1f16_v1i64(<1 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v1f16_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzu x8, s0 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x half> %op1 to <1 x i64> ret <1 x i64> %res @@ -246,14 +450,18 @@ define <2 x i64> @fcvtzu_v2f16_v2i64(<2 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f16_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvtzu x8, s0 -; NONEON-NOSVE-NEXT: fcvtzu x9, s1 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x half> %op1 to <2 x i64> ret <2 x i64> %res @@ -280,23 +488,27 @@ define void @fcvtzu_v4f16_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f16_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: mov h1, v0.h[2] -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fcvtzu x9, s0 -; NONEON-NOSVE-NEXT: fcvtzu x8, s1 -; NONEON-NOSVE-NEXT: fcvtzu x10, s2 -; NONEON-NOSVE-NEXT: fcvtzu x11, s3 -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 -; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #16] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fptoui <4 x half> %op1 to <4 x i64> @@ -339,42 +551,43 @@ define void @fcvtzu_v8f16_v8i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v8f16_v8i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: mov h1, v0.h[2] -; NONEON-NOSVE-NEXT: mov h3, v0.h[3] -; NONEON-NOSVE-NEXT: mov h4, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov h5, v2.h[2] -; NONEON-NOSVE-NEXT: mov h6, v2.h[3] -; NONEON-NOSVE-NEXT: mov h7, v2.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: str q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzu x9, s0 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvtzu x13, s2 -; NONEON-NOSVE-NEXT: fcvtzu x8, s1 -; NONEON-NOSVE-NEXT: fcvt s1, h7 -; NONEON-NOSVE-NEXT: fcvtzu x10, s3 -; NONEON-NOSVE-NEXT: fcvtzu x11, s4 -; NONEON-NOSVE-NEXT: fcvtzu x12, s5 -; NONEON-NOSVE-NEXT: fcvtzu x14, s6 -; NONEON-NOSVE-NEXT: fmov d3, x13 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: fcvtzu x8, s1 -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d2, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x10 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 -; NONEON-NOSVE-NEXT: mov v3.d[1], x8 -; NONEON-NOSVE-NEXT: mov v2.d[1], x14 -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] -; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptoui <8 x half> %op1 to <8 x i64> @@ -439,76 +652,79 @@ define void @fcvtzu_v16f16_v16i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f16_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s3, h1 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h0 -; NONEON-NOSVE-NEXT: mov h0, v0.h[1] -; NONEON-NOSVE-NEXT: mov h1, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s17, h4 -; NONEON-NOSVE-NEXT: mov h18, v4.h[2] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvtzu x8, s3 -; NONEON-NOSVE-NEXT: fcvt s3, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: mov h16, v4.h[3] -; NONEON-NOSVE-NEXT: fcvtzu x9, s6 -; NONEON-NOSVE-NEXT: ldr d6, [sp, #8] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: mov h4, v4.h[1] -; NONEON-NOSVE-NEXT: fcvtzu x11, s2 -; NONEON-NOSVE-NEXT: mov h2, v6.h[2] -; NONEON-NOSVE-NEXT: fcvtzu x10, s17 -; NONEON-NOSVE-NEXT: fcvtzu x13, s5 -; NONEON-NOSVE-NEXT: fcvtzu x12, s3 -; NONEON-NOSVE-NEXT: mov h3, v6.h[3] -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov h5, v6.h[1] -; NONEON-NOSVE-NEXT: fcvt s17, h18 -; NONEON-NOSVE-NEXT: fcvtzu x14, s7 -; NONEON-NOSVE-NEXT: fmov d7, x8 +; NONEON-NOSVE-NEXT: sub sp, sp, #192 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 192 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzu x8, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fmov d0, x11 -; NONEON-NOSVE-NEXT: fcvtzu x11, s1 -; NONEON-NOSVE-NEXT: fmov d1, x13 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvtzu x13, s16 -; NONEON-NOSVE-NEXT: fmov d16, x9 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvtzu x15, s17 -; NONEON-NOSVE-NEXT: mov v0.d[1], x12 -; NONEON-NOSVE-NEXT: mov v1.d[1], x14 -; NONEON-NOSVE-NEXT: fcvtzu x9, s2 -; NONEON-NOSVE-NEXT: mov v16.d[1], x8 -; NONEON-NOSVE-NEXT: fcvtzu x8, s6 -; NONEON-NOSVE-NEXT: fcvtzu x14, s4 -; NONEON-NOSVE-NEXT: fcvtzu x12, s3 -; NONEON-NOSVE-NEXT: mov v7.d[1], x11 -; NONEON-NOSVE-NEXT: fmov d3, x10 -; NONEON-NOSVE-NEXT: fcvtzu x11, s5 -; NONEON-NOSVE-NEXT: fmov d2, x15 -; NONEON-NOSVE-NEXT: stp q16, q1, [x1, #64] -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d4, x8 -; NONEON-NOSVE-NEXT: stp q7, q0, [x1] -; NONEON-NOSVE-NEXT: mov v2.d[1], x13 -; NONEON-NOSVE-NEXT: mov v3.d[1], x14 -; NONEON-NOSVE-NEXT: mov v1.d[1], x12 -; NONEON-NOSVE-NEXT: mov v4.d[1], x11 -; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #96] -; NONEON-NOSVE-NEXT: stp q4, q1, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #176] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #128] +; NONEON-NOSVE-NEXT: fcvtzu x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #192 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptoui <16 x half> %op1 to <16 x i64> @@ -531,7 +747,14 @@ define <2 x i16> @fcvtzu_v2f32_v2i16(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i16> ret <2 x i16> %res @@ -549,8 +772,20 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x float> %op1 to <4 x i16> ret <4 x i16> %res @@ -572,10 +807,31 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i16> @@ -604,15 +860,56 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f32_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v3.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtzu v2.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x float>, ptr %a %res = fptoui <16 x float> %op1 to <16 x i16> @@ -635,7 +932,14 @@ define <2 x i32> @fcvtzu_v2f32_v2i32(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzu v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i32> ret <2 x i32> %res @@ -652,7 +956,18 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <4 x float> %op1 to <4 x i32> ret <4 x i32> %res @@ -670,10 +985,28 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzu v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzu w9, s1 +; NONEON-NOSVE-NEXT: fcvtzu w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i32> @@ -697,9 +1030,13 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v1f32_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x float> %op1 to <1 x i64> ret <1 x i64> %res @@ -717,8 +1054,15 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f32_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x float> %op1 to <2 x i64> ret <2 x i64> %res @@ -740,15 +1084,21 @@ define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v4f32_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptoui <4 x float> %op1 to <4 x i64> @@ -778,21 +1128,33 @@ define void @fcvtzu_v8f32_v8i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzu_v8f32_v8i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s -; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s -; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzu v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzu v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzu x9, s1 +; NONEON-NOSVE-NEXT: fcvtzu x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptoui <8 x float> %op1 to <8 x i64> @@ -814,8 +1176,12 @@ define <1 x i16> @fcvtzu_v1f64_v1i16(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvtzs w8, d0 -; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i16> ret <1 x i16> %res @@ -833,8 +1199,14 @@ define <2 x i16> @fcvtzu_v2f64_v2i16(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i16> ret <2 x i16> %res @@ -867,11 +1239,27 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #78] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w9, [sp, #74] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i16> @@ -919,19 +1307,49 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] -; NONEON-NOSVE-NEXT: adrp x8, .LCPI26_0 -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: xtn v7.2s, v0.2d -; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI26_0] -; NONEON-NOSVE-NEXT: xtn v6.2s, v1.2d -; NONEON-NOSVE-NEXT: xtn v5.2s, v2.2d -; NONEON-NOSVE-NEXT: xtn v4.2s, v3.2d -; NONEON-NOSVE-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: str d2, [sp, #120] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #142] +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: str d0, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #138] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: strh w9, [sp, #134] +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #130] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #128] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptoui <8 x double> %op1 to <8 x i16> @@ -1012,31 +1430,90 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v16f64_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] -; NONEON-NOSVE-NEXT: adrp x8, .LCPI27_0 -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzs v5.2d, v5.2d -; NONEON-NOSVE-NEXT: fcvtzs v4.2d, v4.2d -; NONEON-NOSVE-NEXT: fcvtzs v6.2d, v6.2d -; NONEON-NOSVE-NEXT: fcvtzs v7.2d, v7.2d -; NONEON-NOSVE-NEXT: xtn v19.2s, v0.2d -; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI27_0] -; NONEON-NOSVE-NEXT: xtn v23.2s, v3.2d -; NONEON-NOSVE-NEXT: xtn v18.2s, v1.2d -; NONEON-NOSVE-NEXT: xtn v22.2s, v2.2d -; NONEON-NOSVE-NEXT: xtn v17.2s, v5.2d -; NONEON-NOSVE-NEXT: xtn v21.2s, v6.2d -; NONEON-NOSVE-NEXT: xtn v16.2s, v4.2d -; NONEON-NOSVE-NEXT: xtn v20.2s, v7.2d -; NONEON-NOSVE-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b -; NONEON-NOSVE-NEXT: tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: sub sp, sp, #304 +; NONEON-NOSVE-NEXT: str x29, [sp, #288] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 304 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q6, q7, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #96] +; NONEON-NOSVE-NEXT: stp q1, q7, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q6, q4, [sp] +; NONEON-NOSVE-NEXT: stp q5, q3, [sp, #32] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #176] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #168] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #232] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #232] +; NONEON-NOSVE-NEXT: str d2, [sp, #248] +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #144] +; NONEON-NOSVE-NEXT: strh w9, [sp, #270] +; NONEON-NOSVE-NEXT: strh w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #200] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #160] +; NONEON-NOSVE-NEXT: strh w9, [sp, #266] +; NONEON-NOSVE-NEXT: strh w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #216] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #192] +; NONEON-NOSVE-NEXT: strh w9, [sp, #262] +; NONEON-NOSVE-NEXT: strh w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #200] +; NONEON-NOSVE-NEXT: str d0, [sp, #296] +; NONEON-NOSVE-NEXT: strh w9, [sp, #258] +; NONEON-NOSVE-NEXT: strh w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #216] +; NONEON-NOSVE-NEXT: strh w9, [sp, #286] +; NONEON-NOSVE-NEXT: strh w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #208] +; NONEON-NOSVE-NEXT: strh w9, [sp, #282] +; NONEON-NOSVE-NEXT: strh w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #300] +; NONEON-NOSVE-NEXT: strh w9, [sp, #278] +; NONEON-NOSVE-NEXT: strh w8, [sp, #274] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #296] +; NONEON-NOSVE-NEXT: strh w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #256] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #304 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptoui <16 x double> %op1 to <16 x i16> @@ -1060,9 +1537,12 @@ define <1 x i32> @fcvtzu_v1f64_v1i32(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i32> ret <1 x i32> %res @@ -1080,8 +1560,14 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i32> ret <2 x i32> %res @@ -1103,10 +1589,19 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i32> @@ -1135,15 +1630,32 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v8f64_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzu v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtzu v2.2d, v2.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzu w9, d1 +; NONEON-NOSVE-NEXT: fcvtzu w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptoui <8 x double> %op1 to <8 x i32> @@ -1166,8 +1678,12 @@ define <1 x i64> @fcvtzu_v1f64_v1i64(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v1f64_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvtzu x8, d0 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptoui <1 x double> %op1 to <1 x i64> ret <1 x i64> %res @@ -1184,7 +1700,14 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v2f64_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzu x9, d1 +; NONEON-NOSVE-NEXT: fcvtzu x8, d0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptoui <2 x double> %op1 to <2 x i64> ret <2 x i64> %res @@ -1202,10 +1725,20 @@ define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzu_v4f64_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzu v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzu v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzu x9, d1 +; NONEON-NOSVE-NEXT: fcvtzu x8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzu x9, d1 +; NONEON-NOSVE-NEXT: fcvtzu x8, d0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptoui <4 x double> %op1 to <4 x i64> @@ -1228,9 +1761,26 @@ define <4 x i16> @fcvtzs_v4f16_v4i16(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x half> %op1 to <4 x i16> ret <4 x i16> %res @@ -1248,16 +1798,43 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i16> @@ -1278,22 +1855,75 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i16> @@ -1317,9 +1947,17 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x half> %op1 to <2 x i32> ret <2 x i32> %res @@ -1337,8 +1975,25 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x half> %op1 to <4 x i32> ret <4 x i32> %res @@ -1360,15 +2015,41 @@ define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i32> @@ -1398,21 +2079,73 @@ define void @fcvtzs_v16f16_v16i32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s -; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: fcvtzs w9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i32> @@ -1433,9 +2166,13 @@ define <1 x i64> @fcvtzs_v1f16_v1i64(<1 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v1f16_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzs x8, s0 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x half> %op1 to <1 x i64> ret <1 x i64> %res @@ -1456,14 +2193,18 @@ define <2 x i64> @fcvtzs_v2f16_v2i64(<2 x half> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f16_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov h1, v0.h[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 ; NONEON-NOSVE-NEXT: fcvtzs x8, s0 -; NONEON-NOSVE-NEXT: fcvtzs x9, s1 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x half> %op1 to <2 x i64> ret <2 x i64> %res @@ -1490,23 +2231,27 @@ define void @fcvtzs_v4f16_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f16_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: mov h1, v0.h[2] -; NONEON-NOSVE-NEXT: mov h2, v0.h[3] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 ; NONEON-NOSVE-NEXT: fcvtzs x9, s0 -; NONEON-NOSVE-NEXT: fcvtzs x8, s1 -; NONEON-NOSVE-NEXT: fcvtzs x10, s2 -; NONEON-NOSVE-NEXT: fcvtzs x11, s3 -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 -; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #16] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %res = fptosi <4 x half> %op1 to <4 x i64> @@ -1549,42 +2294,43 @@ define void @fcvtzs_v8f16_v8i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v8f16_v8i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: mov h1, v0.h[2] -; NONEON-NOSVE-NEXT: mov h3, v0.h[3] -; NONEON-NOSVE-NEXT: mov h4, v0.h[1] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: mov h5, v2.h[2] -; NONEON-NOSVE-NEXT: mov h6, v2.h[3] -; NONEON-NOSVE-NEXT: mov h7, v2.h[1] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 +; NONEON-NOSVE-NEXT: str q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzs x9, s0 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvtzs x13, s2 -; NONEON-NOSVE-NEXT: fcvtzs x8, s1 -; NONEON-NOSVE-NEXT: fcvt s1, h7 -; NONEON-NOSVE-NEXT: fcvtzs x10, s3 -; NONEON-NOSVE-NEXT: fcvtzs x11, s4 -; NONEON-NOSVE-NEXT: fcvtzs x12, s5 -; NONEON-NOSVE-NEXT: fcvtzs x14, s6 -; NONEON-NOSVE-NEXT: fmov d3, x13 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: fcvtzs x8, s1 -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d2, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x10 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 -; NONEON-NOSVE-NEXT: mov v3.d[1], x8 -; NONEON-NOSVE-NEXT: mov v2.d[1], x14 -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] -; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %res = fptosi <8 x half> %op1 to <8 x i64> @@ -1649,76 +2395,79 @@ define void @fcvtzs_v16f16_v16i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f16_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: mov h2, v1.h[2] -; NONEON-NOSVE-NEXT: fcvt s3, h1 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] -; NONEON-NOSVE-NEXT: mov h5, v1.h[3] -; NONEON-NOSVE-NEXT: mov h7, v0.h[2] -; NONEON-NOSVE-NEXT: mov h16, v0.h[3] -; NONEON-NOSVE-NEXT: fcvt s6, h0 -; NONEON-NOSVE-NEXT: mov h0, v0.h[1] -; NONEON-NOSVE-NEXT: mov h1, v1.h[1] -; NONEON-NOSVE-NEXT: fcvt s17, h4 -; NONEON-NOSVE-NEXT: mov h18, v4.h[2] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvtzs x8, s3 -; NONEON-NOSVE-NEXT: fcvt s3, h5 -; NONEON-NOSVE-NEXT: fcvt s5, h7 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: mov h16, v4.h[3] -; NONEON-NOSVE-NEXT: fcvtzs x9, s6 -; NONEON-NOSVE-NEXT: ldr d6, [sp, #8] -; NONEON-NOSVE-NEXT: fcvt s0, h0 -; NONEON-NOSVE-NEXT: fcvt s1, h1 -; NONEON-NOSVE-NEXT: mov h4, v4.h[1] -; NONEON-NOSVE-NEXT: fcvtzs x11, s2 -; NONEON-NOSVE-NEXT: mov h2, v6.h[2] -; NONEON-NOSVE-NEXT: fcvtzs x10, s17 -; NONEON-NOSVE-NEXT: fcvtzs x13, s5 -; NONEON-NOSVE-NEXT: fcvtzs x12, s3 -; NONEON-NOSVE-NEXT: mov h3, v6.h[3] -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov h5, v6.h[1] -; NONEON-NOSVE-NEXT: fcvt s17, h18 -; NONEON-NOSVE-NEXT: fcvtzs x14, s7 -; NONEON-NOSVE-NEXT: fmov d7, x8 +; NONEON-NOSVE-NEXT: sub sp, sp, #192 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 192 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s0, h0 ; NONEON-NOSVE-NEXT: fcvtzs x8, s0 -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fmov d0, x11 -; NONEON-NOSVE-NEXT: fcvtzs x11, s1 -; NONEON-NOSVE-NEXT: fmov d1, x13 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvtzs x13, s16 -; NONEON-NOSVE-NEXT: fmov d16, x9 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvtzs x15, s17 -; NONEON-NOSVE-NEXT: mov v0.d[1], x12 -; NONEON-NOSVE-NEXT: mov v1.d[1], x14 -; NONEON-NOSVE-NEXT: fcvtzs x9, s2 -; NONEON-NOSVE-NEXT: mov v16.d[1], x8 -; NONEON-NOSVE-NEXT: fcvtzs x8, s6 -; NONEON-NOSVE-NEXT: fcvtzs x14, s4 -; NONEON-NOSVE-NEXT: fcvtzs x12, s3 -; NONEON-NOSVE-NEXT: mov v7.d[1], x11 -; NONEON-NOSVE-NEXT: fmov d3, x10 -; NONEON-NOSVE-NEXT: fcvtzs x11, s5 -; NONEON-NOSVE-NEXT: fmov d2, x15 -; NONEON-NOSVE-NEXT: stp q16, q1, [x1, #64] -; NONEON-NOSVE-NEXT: fmov d1, x9 -; NONEON-NOSVE-NEXT: fmov d4, x8 -; NONEON-NOSVE-NEXT: stp q7, q0, [x1] -; NONEON-NOSVE-NEXT: mov v2.d[1], x13 -; NONEON-NOSVE-NEXT: mov v3.d[1], x14 -; NONEON-NOSVE-NEXT: mov v1.d[1], x12 -; NONEON-NOSVE-NEXT: mov v4.d[1], x11 -; NONEON-NOSVE-NEXT: stp q3, q2, [x1, #96] -; NONEON-NOSVE-NEXT: stp q4, q1, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #176] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #128] +; NONEON-NOSVE-NEXT: fcvtzs x9, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #192 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %res = fptosi <16 x half> %op1 to <16 x i64> @@ -1741,7 +2490,14 @@ define <2 x i16> @fcvtzs_v2f32_v2i16(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i16> ret <2 x i16> %res @@ -1759,8 +2515,20 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x float> %op1 to <4 x i16> ret <4 x i16> %res @@ -1782,10 +2550,31 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i16> @@ -1814,15 +2603,56 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f32_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v3.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtzs v2.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: fcvtzs w8, s1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x float>, ptr %a %res = fptosi <16 x float> %op1 to <16 x i16> @@ -1845,7 +2675,14 @@ define <2 x i32> @fcvtzs_v2f32_v2i32(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i32> ret <2 x i32> %res @@ -1862,7 +2699,18 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <4 x float> %op1 to <4 x i32> ret <4 x i32> %res @@ -1880,10 +2728,28 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtzs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w9, s1 +; NONEON-NOSVE-NEXT: fcvtzs w8, s0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i32> @@ -1907,9 +2773,13 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v1f32_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x float> %op1 to <1 x i64> ret <1 x i64> %res @@ -1927,8 +2797,15 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f32_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x float> %op1 to <2 x i64> ret <2 x i64> %res @@ -1950,15 +2827,21 @@ define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v4f32_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %res = fptosi <4 x float> %op1 to <4 x i64> @@ -1988,21 +2871,33 @@ define void @fcvtzs_v8f32_v8i64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: fcvtzs_v8f32_v8i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: fcvtl v1.2d, v1.2s -; NONEON-NOSVE-NEXT: fcvtl v0.2d, v0.2s -; NONEON-NOSVE-NEXT: fcvtl v2.2d, v2.2s -; NONEON-NOSVE-NEXT: fcvtl v3.2d, v3.2s -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: fcvtzs x9, s1 +; NONEON-NOSVE-NEXT: fcvtzs x8, s0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %res = fptosi <8 x float> %op1 to <8 x i64> @@ -2026,8 +2921,12 @@ define <1 x i16> @fcvtzs_v1f64_v1i16(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvtzs w8, d0 -; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i16> ret <1 x i16> %res @@ -2045,8 +2944,14 @@ define <2 x i16> @fcvtzs_v2f64_v2i16(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i16> ret <2 x i16> %res @@ -2079,11 +2984,27 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #78] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w9, [sp, #74] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i16> @@ -2131,19 +3052,49 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] -; NONEON-NOSVE-NEXT: adrp x8, .LCPI61_0 -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: xtn v7.2s, v0.2d -; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI61_0] -; NONEON-NOSVE-NEXT: xtn v6.2s, v1.2d -; NONEON-NOSVE-NEXT: xtn v5.2s, v2.2d -; NONEON-NOSVE-NEXT: xtn v4.2s, v3.2d -; NONEON-NOSVE-NEXT: tbl v0.16b, { v4.16b, v5.16b, v6.16b, v7.16b }, v0.16b +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: str d2, [sp, #120] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #142] +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: str d0, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #138] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: strh w9, [sp, #134] +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #130] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #128] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptosi <8 x double> %op1 to <8 x i16> @@ -2224,31 +3175,90 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v16f64_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] -; NONEON-NOSVE-NEXT: adrp x8, .LCPI62_0 -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: fcvtzs v5.2d, v5.2d -; NONEON-NOSVE-NEXT: fcvtzs v4.2d, v4.2d -; NONEON-NOSVE-NEXT: fcvtzs v6.2d, v6.2d -; NONEON-NOSVE-NEXT: fcvtzs v7.2d, v7.2d -; NONEON-NOSVE-NEXT: xtn v19.2s, v0.2d -; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI62_0] -; NONEON-NOSVE-NEXT: xtn v23.2s, v3.2d -; NONEON-NOSVE-NEXT: xtn v18.2s, v1.2d -; NONEON-NOSVE-NEXT: xtn v22.2s, v2.2d -; NONEON-NOSVE-NEXT: xtn v17.2s, v5.2d -; NONEON-NOSVE-NEXT: xtn v21.2s, v6.2d -; NONEON-NOSVE-NEXT: xtn v16.2s, v4.2d -; NONEON-NOSVE-NEXT: xtn v20.2s, v7.2d -; NONEON-NOSVE-NEXT: tbl v1.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v0.16b -; NONEON-NOSVE-NEXT: tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: sub sp, sp, #304 +; NONEON-NOSVE-NEXT: str x29, [sp, #288] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 304 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #288] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q6, q7, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #96] +; NONEON-NOSVE-NEXT: stp q1, q7, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q6, q4, [sp] +; NONEON-NOSVE-NEXT: stp q5, q3, [sp, #32] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #176] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #168] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #232] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #232] +; NONEON-NOSVE-NEXT: str d2, [sp, #248] +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #144] +; NONEON-NOSVE-NEXT: strh w9, [sp, #270] +; NONEON-NOSVE-NEXT: strh w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #200] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #160] +; NONEON-NOSVE-NEXT: strh w9, [sp, #266] +; NONEON-NOSVE-NEXT: strh w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #216] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #192] +; NONEON-NOSVE-NEXT: strh w9, [sp, #262] +; NONEON-NOSVE-NEXT: strh w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #200] +; NONEON-NOSVE-NEXT: str d0, [sp, #296] +; NONEON-NOSVE-NEXT: strh w9, [sp, #258] +; NONEON-NOSVE-NEXT: strh w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #216] +; NONEON-NOSVE-NEXT: strh w9, [sp, #286] +; NONEON-NOSVE-NEXT: strh w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #208] +; NONEON-NOSVE-NEXT: strh w9, [sp, #282] +; NONEON-NOSVE-NEXT: strh w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #300] +; NONEON-NOSVE-NEXT: strh w9, [sp, #278] +; NONEON-NOSVE-NEXT: strh w8, [sp, #274] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #296] +; NONEON-NOSVE-NEXT: strh w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #256] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #304 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x double>, ptr %a %res = fptosi <16 x double> %op1 to <16 x i16> @@ -2272,9 +3282,12 @@ define <1 x i32> @fcvtzs_v1f64_v1i32(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i32> ret <1 x i32> %res @@ -2292,8 +3305,14 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i32> ret <2 x i32> %res @@ -2315,10 +3334,19 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i32> @@ -2347,15 +3375,32 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v8f64_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtzs v2.2d, v2.2d -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: fcvtzs w9, d1 +; NONEON-NOSVE-NEXT: fcvtzs w8, d0 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x double>, ptr %a %res = fptosi <8 x double> %op1 to <8 x i32> @@ -2378,8 +3423,12 @@ define <1 x i64> @fcvtzs_v1f64_v1i64(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v1f64_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fcvtzs x8, d0 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = fptosi <1 x double> %op1 to <1 x i64> ret <1 x i64> %res @@ -2396,7 +3445,14 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v2f64_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: fcvtzs x9, d1 +; NONEON-NOSVE-NEXT: fcvtzs x8, d0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = fptosi <2 x double> %op1 to <2 x i64> ret <2 x i64> %res @@ -2414,10 +3470,20 @@ define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fcvtzs_v4f64_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fcvtzs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtzs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvtzs x9, d1 +; NONEON-NOSVE-NEXT: fcvtzs x8, d0 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvtzs x9, d1 +; NONEON-NOSVE-NEXT: fcvtzs x8, d0 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %res = fptosi <4 x double> %op1 to <4 x i64> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index d3b09374676556..69661049bcb6f3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -31,10 +31,27 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, <2 x i1> %mask ; ; NONEON-NOSVE-LABEL: select_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uzp1 v2.4h, v2.4h, v0.4h -; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: str w10, [sp, #28] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x half> %op1, <2 x half> %op2 ret <2 x half> %sel @@ -57,9 +74,40 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask ; ; NONEON-NOSVE-LABEL: select_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #18] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: sbfx w9, w11, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2 ret <4 x half> %sel @@ -83,10 +131,68 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask ; ; NONEON-NOSVE-LABEL: select_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: shl v2.8h, v2.8h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.8h, v2.8h, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #47] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #45] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: tst w13, #0xffff +; NONEON-NOSVE-NEXT: sbfx w13, w15, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: tst w13, #0xffff +; NONEON-NOSVE-NEXT: sbfx w13, w14, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: tst w13, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: tst w12, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: tst w11, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: tst w10, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2 ret <8 x half> %sel @@ -107,122 +213,126 @@ define void @select_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: mov h2, v1.h[1] -; NONEON-NOSVE-NEXT: mov h3, v0.h[1] -; NONEON-NOSVE-NEXT: mov h4, v1.h[2] -; NONEON-NOSVE-NEXT: mov h5, v0.h[2] -; NONEON-NOSVE-NEXT: fcvt s6, h1 -; NONEON-NOSVE-NEXT: fcvt s7, h0 -; NONEON-NOSVE-NEXT: mov h16, v1.h[6] -; NONEON-NOSVE-NEXT: mov h17, v0.h[6] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: fcvt s17, h17 +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h4, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h5, [sp, #4] +; NONEON-NOSVE-NEXT: ldr h16, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s2, h0 +; NONEON-NOSVE-NEXT: fcvt s3, h1 +; NONEON-NOSVE-NEXT: ldr h17, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s6, h4 +; NONEON-NOSVE-NEXT: fcvt s7, h5 +; NONEON-NOSVE-NEXT: ldr h19, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s18, h17 +; NONEON-NOSVE-NEXT: ldr h21, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h22, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s20, h19 +; NONEON-NOSVE-NEXT: ldr h24, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h25, [sp, #34] ; NONEON-NOSVE-NEXT: fcmp s3, s2 -; NONEON-NOSVE-NEXT: mov h2, v1.h[3] -; NONEON-NOSVE-NEXT: mov h3, v0.h[3] -; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: fcvt s2, h16 +; NONEON-NOSVE-NEXT: ldr h3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h26, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h27, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h28, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h29, [sp, #44] +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, eq ; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[4] -; NONEON-NOSVE-NEXT: mov h7, v0.h[4] -; NONEON-NOSVE-NEXT: fcvt s2, h2 -; NONEON-NOSVE-NEXT: fcvt s3, h3 -; NONEON-NOSVE-NEXT: csetm w14, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v1.h[5] -; NONEON-NOSVE-NEXT: mov h5, v0.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w12, eq -; NONEON-NOSVE-NEXT: fcmp s3, s2 -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w11, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v1.h[7] -; NONEON-NOSVE-NEXT: mov h7, v0.h[7] -; NONEON-NOSVE-NEXT: mov h18, v3.h[3] -; NONEON-NOSVE-NEXT: csetm w13, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: mov h4, v3.h[1] -; NONEON-NOSVE-NEXT: mov h5, v2.h[1] -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: csetm w9, eq +; NONEON-NOSVE-NEXT: fcvt s7, h3 +; NONEON-NOSVE-NEXT: ldr h6, [sp, #26] +; NONEON-NOSVE-NEXT: fcsel s1, s5, s4, eq +; NONEON-NOSVE-NEXT: fcmp s18, s2 +; NONEON-NOSVE-NEXT: fcvt s4, h6 +; NONEON-NOSVE-NEXT: fcvt s18, h21 +; NONEON-NOSVE-NEXT: ldr h5, [sp, #28] +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: fcsel s2, s17, s16, eq +; NONEON-NOSVE-NEXT: fcmp s20, s7 +; NONEON-NOSVE-NEXT: fcvt s16, h5 +; NONEON-NOSVE-NEXT: fcvt s17, h22 +; NONEON-NOSVE-NEXT: ldr h7, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h20, [sp, #14] +; NONEON-NOSVE-NEXT: str h1, [sp, #68] +; NONEON-NOSVE-NEXT: fcsel s3, s19, s3, eq +; NONEON-NOSVE-NEXT: fcmp s18, s4 +; NONEON-NOSVE-NEXT: fcvt s19, h7 +; NONEON-NOSVE-NEXT: fcvt s23, h20 +; NONEON-NOSVE-NEXT: ldr h18, [sp, #48] +; NONEON-NOSVE-NEXT: str h2, [sp, #70] +; NONEON-NOSVE-NEXT: fcsel s4, s21, s6, eq ; NONEON-NOSVE-NEXT: fcmp s17, s16 -; NONEON-NOSVE-NEXT: mov h16, v3.h[2] -; NONEON-NOSVE-NEXT: fcvt s4, h4 -; NONEON-NOSVE-NEXT: mov h17, v2.h[2] -; NONEON-NOSVE-NEXT: fcvt s5, h5 -; NONEON-NOSVE-NEXT: csetm w10, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: fcvt s6, h3 -; NONEON-NOSVE-NEXT: fcvt s7, h2 -; NONEON-NOSVE-NEXT: csetm w15, eq -; NONEON-NOSVE-NEXT: fcmp s5, s4 -; NONEON-NOSVE-NEXT: fmov s4, w14 -; NONEON-NOSVE-NEXT: csetm w16, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v2.h[3] -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: fcvt s16, h17 -; NONEON-NOSVE-NEXT: mov v4.h[1], w8 ; NONEON-NOSVE-NEXT: fcvt s17, h18 -; NONEON-NOSVE-NEXT: csetm w14, eq -; NONEON-NOSVE-NEXT: fmov s5, w14 -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcmp s16, s7 -; NONEON-NOSVE-NEXT: mov h7, v3.h[4] -; NONEON-NOSVE-NEXT: mov h16, v2.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[2], w12 -; NONEON-NOSVE-NEXT: mov v5.h[1], w16 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s6, s17 -; NONEON-NOSVE-NEXT: mov h17, v2.h[5] -; NONEON-NOSVE-NEXT: fcvt s6, h7 -; NONEON-NOSVE-NEXT: fcvt s7, h16 -; NONEON-NOSVE-NEXT: mov h16, v3.h[5] -; NONEON-NOSVE-NEXT: mov v4.h[3], w11 -; NONEON-NOSVE-NEXT: mov v5.h[2], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s17, h17 -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov h6, v3.h[6] -; NONEON-NOSVE-NEXT: mov h7, v2.h[6] -; NONEON-NOSVE-NEXT: fcvt s16, h16 -; NONEON-NOSVE-NEXT: mov v4.h[4], w13 -; NONEON-NOSVE-NEXT: mov v5.h[3], w8 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcvt s6, h6 -; NONEON-NOSVE-NEXT: fcvt s7, h7 -; NONEON-NOSVE-NEXT: fcmp s17, s16 -; NONEON-NOSVE-NEXT: mov h16, v3.h[7] -; NONEON-NOSVE-NEXT: mov h17, v2.h[7] -; NONEON-NOSVE-NEXT: mov v5.h[4], w8 -; NONEON-NOSVE-NEXT: mov v4.h[5], w9 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: fcvt s6, h16 -; NONEON-NOSVE-NEXT: fcvt s7, h17 -; NONEON-NOSVE-NEXT: mov v5.h[5], w8 -; NONEON-NOSVE-NEXT: mov v4.h[6], w10 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: fcmp s7, s6 -; NONEON-NOSVE-NEXT: mov v5.h[6], w8 -; NONEON-NOSVE-NEXT: mov v4.h[7], w15 -; NONEON-NOSVE-NEXT: csetm w8, eq -; NONEON-NOSVE-NEXT: mov v5.h[7], w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: fcvt s21, h24 +; NONEON-NOSVE-NEXT: ldr h16, [sp, #50] +; NONEON-NOSVE-NEXT: str h3, [sp, #72] +; NONEON-NOSVE-NEXT: fcsel s5, s22, s5, eq +; NONEON-NOSVE-NEXT: fcmp s23, s19 +; NONEON-NOSVE-NEXT: fcvt s22, h16 +; NONEON-NOSVE-NEXT: fcvt s23, h25 +; NONEON-NOSVE-NEXT: ldr h19, [sp, #52] +; NONEON-NOSVE-NEXT: str h4, [sp, #74] +; NONEON-NOSVE-NEXT: fcsel s6, s20, s7, eq +; NONEON-NOSVE-NEXT: fcmp s21, s17 +; NONEON-NOSVE-NEXT: fcvt s20, h19 +; NONEON-NOSVE-NEXT: fcvt s21, h26 +; NONEON-NOSVE-NEXT: ldr h17, [sp, #54] +; NONEON-NOSVE-NEXT: str h5, [sp, #76] +; NONEON-NOSVE-NEXT: fcsel s7, s24, s18, eq +; NONEON-NOSVE-NEXT: fcmp s23, s22 +; NONEON-NOSVE-NEXT: fcvt s22, h17 +; NONEON-NOSVE-NEXT: fcvt s23, h27 +; NONEON-NOSVE-NEXT: ldr h18, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h24, [sp, #40] +; NONEON-NOSVE-NEXT: str h6, [sp, #78] +; NONEON-NOSVE-NEXT: fcsel s16, s25, s16, eq +; NONEON-NOSVE-NEXT: fcmp s21, s20 +; NONEON-NOSVE-NEXT: fcvt s21, h18 +; NONEON-NOSVE-NEXT: fcvt s25, h24 +; NONEON-NOSVE-NEXT: ldr h20, [sp, #58] +; NONEON-NOSVE-NEXT: str h7, [sp, #80] +; NONEON-NOSVE-NEXT: fcsel s19, s26, s19, eq +; NONEON-NOSVE-NEXT: fcmp s23, s22 +; NONEON-NOSVE-NEXT: fcvt s23, h20 +; NONEON-NOSVE-NEXT: fcvt s26, h28 +; NONEON-NOSVE-NEXT: ldr h22, [sp, #60] +; NONEON-NOSVE-NEXT: str h16, [sp, #82] +; NONEON-NOSVE-NEXT: fcsel s17, s27, s17, eq +; NONEON-NOSVE-NEXT: fcmp s25, s21 +; NONEON-NOSVE-NEXT: fcvt s25, h22 +; NONEON-NOSVE-NEXT: fcvt s27, h29 +; NONEON-NOSVE-NEXT: ldr h21, [sp, #62] +; NONEON-NOSVE-NEXT: str h19, [sp, #84] +; NONEON-NOSVE-NEXT: fcsel s18, s24, s18, eq +; NONEON-NOSVE-NEXT: ldr h24, [sp, #46] +; NONEON-NOSVE-NEXT: fcmp s26, s23 +; NONEON-NOSVE-NEXT: fcvt s23, h21 +; NONEON-NOSVE-NEXT: str h17, [sp, #86] +; NONEON-NOSVE-NEXT: fcvt s26, h24 +; NONEON-NOSVE-NEXT: fcsel s20, s28, s20, eq +; NONEON-NOSVE-NEXT: fcmp s27, s25 +; NONEON-NOSVE-NEXT: ldr h25, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h27, [sp] +; NONEON-NOSVE-NEXT: str h18, [sp, #88] +; NONEON-NOSVE-NEXT: fcvt s17, h25 +; NONEON-NOSVE-NEXT: fcvt s18, h27 +; NONEON-NOSVE-NEXT: fcsel s7, s29, s22, eq +; NONEON-NOSVE-NEXT: fcmp s26, s23 +; NONEON-NOSVE-NEXT: str h20, [sp, #90] +; NONEON-NOSVE-NEXT: fcsel s16, s24, s21, eq +; NONEON-NOSVE-NEXT: str h7, [sp, #92] +; NONEON-NOSVE-NEXT: fcmp s18, s17 +; NONEON-NOSVE-NEXT: str h16, [sp, #94] +; NONEON-NOSVE-NEXT: fcsel s2, s27, s25, eq +; NONEON-NOSVE-NEXT: str h2, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -249,9 +359,22 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %m ; ; NONEON-NOSVE-LABEL: select_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 -; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: str d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: sbfx w8, w9, #0, #1 +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2 ret <2 x float> %sel @@ -275,10 +398,36 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m ; ; NONEON-NOSVE-LABEL: select_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: shl v2.4s, v2.4s, #31 -; NONEON-NOSVE-NEXT: cmlt v2.4s, v2.4s, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #12] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: sbfx w9, w11, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #56] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: fcsel s3, s2, s0, ne +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, ne +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2 ret <4 x float> %sel @@ -299,14 +448,45 @@ define void @select_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: fcmeq v4.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcmeq v5.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #20] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #4] +; NONEON-NOSVE-NEXT: ldr s4, [sp, #12] +; NONEON-NOSVE-NEXT: ldr s17, [sp] +; NONEON-NOSVE-NEXT: ldp s6, s7, [sp, #36] +; NONEON-NOSVE-NEXT: fcmp s1, s0 +; NONEON-NOSVE-NEXT: fcsel s0, s1, s0, eq +; NONEON-NOSVE-NEXT: fcmp s3, s2 +; NONEON-NOSVE-NEXT: ldp s1, s5, [sp, #28] +; NONEON-NOSVE-NEXT: fcsel s2, s3, s2, eq +; NONEON-NOSVE-NEXT: ldp s16, s3, [sp, #44] +; NONEON-NOSVE-NEXT: fcmp s4, s1 +; NONEON-NOSVE-NEXT: fcsel s1, s4, s1, eq +; NONEON-NOSVE-NEXT: fcmp s5, s3 +; NONEON-NOSVE-NEXT: ldr s4, [sp, #52] +; NONEON-NOSVE-NEXT: fcsel s3, s5, s3, eq +; NONEON-NOSVE-NEXT: fcmp s6, s4 +; NONEON-NOSVE-NEXT: ldr s5, [sp, #56] +; NONEON-NOSVE-NEXT: stp s2, s1, [sp, #72] +; NONEON-NOSVE-NEXT: fcsel s4, s6, s4, eq +; NONEON-NOSVE-NEXT: fcmp s7, s5 +; NONEON-NOSVE-NEXT: ldr s6, [sp, #60] +; NONEON-NOSVE-NEXT: fcsel s5, s7, s5, eq +; NONEON-NOSVE-NEXT: fcmp s16, s6 +; NONEON-NOSVE-NEXT: ldr s7, [sp, #16] +; NONEON-NOSVE-NEXT: stp s3, s4, [sp, #80] +; NONEON-NOSVE-NEXT: fcsel s6, s16, s6, eq +; NONEON-NOSVE-NEXT: fcmp s17, s7 +; NONEON-NOSVE-NEXT: fcsel s3, s17, s7, eq +; NONEON-NOSVE-NEXT: stp s5, s6, [sp, #88] +; NONEON-NOSVE-NEXT: stp s3, s0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -325,10 +505,13 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> ; ; NONEON-NOSVE-LABEL: select_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: fmov d2, x8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: fcsel d0, d0, d1, ne +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2 ret <1 x double> %sel @@ -352,10 +535,23 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> ; ; NONEON-NOSVE-LABEL: select_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: shl v2.2d, v2.2d, #63 -; NONEON-NOSVE-NEXT: cmlt v2.2d, v2.2d, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: sbfx x8, x8, #0, #1 +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: sbfx x8, x9, #0, #1 +; NONEON-NOSVE-NEXT: fcsel d3, d2, d0, ne +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, ne +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2 ret <2 x double> %sel @@ -376,14 +572,29 @@ define void @select_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: fcmeq v4.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: fcmeq v5.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d5, d1, [sp] +; NONEON-NOSVE-NEXT: ldp d0, d3, [sp, #24] +; NONEON-NOSVE-NEXT: ldp d4, d2, [sp, #40] +; NONEON-NOSVE-NEXT: fcmp d1, d0 +; NONEON-NOSVE-NEXT: fcsel d0, d1, d0, eq +; NONEON-NOSVE-NEXT: fcmp d3, d2 +; NONEON-NOSVE-NEXT: ldr d1, [sp, #56] +; NONEON-NOSVE-NEXT: fcsel d2, d3, d2, eq +; NONEON-NOSVE-NEXT: fcmp d4, d1 +; NONEON-NOSVE-NEXT: ldr d3, [sp, #16] +; NONEON-NOSVE-NEXT: fcsel d1, d4, d1, eq +; NONEON-NOSVE-NEXT: fcmp d5, d3 +; NONEON-NOSVE-NEXT: fcsel d3, d5, d3, eq +; NONEON-NOSVE-NEXT: stp d2, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp d3, d0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll index ae97a266c6ff0d..3ba61c3335a64c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -25,10 +25,21 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i8> %op1, i8 5, i64 3 ret <4 x i8> %r @@ -50,10 +61,23 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.b[7], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x i8> %op1, i8 5, i64 7 ret <8 x i8> %r @@ -75,8 +99,25 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v16i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.b[15], w8 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <16 x i8> %op1, i8 5, i64 15 ret <16 x i8> %r @@ -98,8 +139,25 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v32i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v1.b[15], w8 +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <32 x i8> %op1, i8 5, i64 31 ret <32 x i8> %r @@ -122,10 +180,18 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i16> %op1, i16 5, i64 1 ret <2 x i16> %r @@ -147,10 +213,21 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i16> %op1, i16 5, i64 3 ret <4 x i16> %r @@ -172,8 +249,23 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v8i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.h[7], w8 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x i16> %op1, i16 5, i64 7 ret <8 x i16> %r @@ -195,8 +287,23 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v1.h[7], w8 +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <16 x i16> %op1, i16 5, i64 15 ret <16 x i16> %r @@ -219,10 +326,18 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i32> %op1, i32 5, i64 1 ret <2 x i32> %r @@ -244,8 +359,20 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v4i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x i32> %op1, i32 5, i64 3 ret <4 x i32> %r @@ -267,9 +394,20 @@ define <8 x i32> @insertelement_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: insertelement_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %r = insertelement <8 x i32> %op1, i32 5, i64 7 @@ -286,8 +424,12 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = insertelement <1 x i64> %op1, i64 5, i64 0 ret <1 x i64> %r @@ -309,8 +451,18 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v2i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x i64> %op1, i64 5, i64 1 ret <2 x i64> %r @@ -332,9 +484,18 @@ define <4 x i64> @insertelement_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: insertelement_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: mov w8, #5 // =0x5 -; NONEON-NOSVE-NEXT: mov v1.d[1], x8 +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %r = insertelement <4 x i64> %op1, i64 5, i64 3 @@ -358,11 +519,14 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) { ; NONEON-NOSVE-LABEL: insertelement_v2f16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: adrp x8, .LCPI14_0 -; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI14_0 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: ld1r { v1.4h }, [x8] -; NONEON-NOSVE-NEXT: mov v1.h[0], v0.h[0] -; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [x8, :lo12:.LCPI14_0] +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: str h1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x half> %op1, half 5.0, i64 1 ret <2 x half> %r @@ -384,11 +548,22 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: adrp x8, .LCPI15_0 -; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI15_0 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [x8, :lo12:.LCPI15_0] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: str h1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: str w8, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x half> %op1, half 5.0, i64 3 ret <4 x half> %r @@ -410,9 +585,24 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v8f16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: adrp x8, .LCPI16_0 -; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI16_0 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x8] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [x8, :lo12:.LCPI16_0] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str h1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <8 x half> %op1, half 5.0, i64 7 ret <8 x half> %r @@ -434,10 +624,24 @@ define <16 x half> @insertelement_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: insertelement_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: adrp x8, .LCPI17_0 -; NONEON-NOSVE-NEXT: add x8, x8, :lo12:.LCPI17_0 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[7], [x8] +; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] +; NONEON-NOSVE-NEXT: ldr h1, [x8, :lo12:.LCPI17_0] +; NONEON-NOSVE-NEXT: str h1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str h1, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str h1, [sp, #46] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %r = insertelement <16 x half> %op1, half 5.0, i64 15 @@ -461,10 +665,18 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov s1, #5.00000000 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: mov v0.s[1], v1.s[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: mov w8, #1084227584 // =0x40a00000 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr s1, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x float> %op1, float 5.0, i64 1 ret <2 x float> %r @@ -486,8 +698,20 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov s1, #5.00000000 -; NONEON-NOSVE-NEXT: mov v0.s[3], v1.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: mov w8, #1084227584 // =0x40a00000 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <4 x float> %op1, float 5.0, i64 3 ret <4 x float> %r @@ -509,9 +733,21 @@ define <8 x float> @insertelement_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: insertelement_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov s2, #5.00000000 -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: mov v1.s[3], v2.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: mov w8, #1084227584 // =0x40a00000 +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %r = insertelement <8 x float> %op1, float 5.0, i64 7 @@ -527,8 +763,12 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %r = insertelement <1 x double> %op1, double 5.0, i64 0 ret <1 x double> %r @@ -550,8 +790,18 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) { ; ; NONEON-NOSVE-LABEL: insertelement_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov d1, #5.00000000 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %r = insertelement <2 x double> %op1, double 5.0, i64 1 ret <2 x double> %r @@ -573,10 +823,19 @@ define <4 x double> @insertelement_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: insertelement_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov d0, #5.00000000 +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: mov x8, #4617315517961601024 // =0x4014000000000000 ; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] -; NONEON-NOSVE-NEXT: mov v1.d[1], v0.d[0] +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #32] ; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %r = insertelement <4 x double> %op1, double 5.0, i64 3 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll index 1b438559e05380..a2875ffef2e88a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -20,7 +20,27 @@ define <4 x i8> @add_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: add_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = add <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -37,7 +57,43 @@ define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: add_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = add <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -54,7 +110,74 @@ define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: add_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = add <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -72,11 +195,143 @@ define void @add_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -96,7 +351,18 @@ define <2 x i16> @add_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: add_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = add <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -113,7 +379,27 @@ define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: add_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = add <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -130,7 +416,42 @@ define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: add_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = add <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -148,11 +469,79 @@ define void @add_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -172,7 +561,18 @@ define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: add_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = add <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -189,7 +589,24 @@ define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: add_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = add <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -207,11 +624,43 @@ define void @add_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -231,7 +680,14 @@ define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: add_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = add <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -248,7 +704,17 @@ define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: add_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: add v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: add x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = add <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -266,11 +732,29 @@ define void @add_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: add v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: add x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: add x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -303,7 +787,27 @@ define <4 x i8> @mul_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = mul <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -329,7 +833,43 @@ define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = mul <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -355,7 +895,74 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = mul <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -384,11 +991,143 @@ define void @mul_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: mul_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: mul v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: mul v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -417,7 +1156,17 @@ define <2 x i16> @mul_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = mul <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -443,7 +1192,27 @@ define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = mul <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -469,7 +1238,42 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = mul <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -498,11 +1302,79 @@ define void @mul_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: mul_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: mul v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: mul v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -531,7 +1403,17 @@ define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = mul <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -557,7 +1439,22 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mul v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = mul <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -586,11 +1483,39 @@ define void @mul_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: mul_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: mul v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: mul v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: mul w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -619,12 +1544,14 @@ define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d1 ; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: mul x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = mul <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -650,14 +1577,16 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: mul_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x10, d1 -; NONEON-NOSVE-NEXT: fmov x11, d0 -; NONEON-NOSVE-NEXT: mov x8, v1.d[1] -; NONEON-NOSVE-NEXT: mov x9, v0.d[1] -; NONEON-NOSVE-NEXT: mul x10, x11, x10 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: mul x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: mul x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = mul <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -686,25 +1615,27 @@ define void @mul_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: mul_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: fmov x12, d2 -; NONEON-NOSVE-NEXT: mov x11, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: mov x10, v3.d[1] -; NONEON-NOSVE-NEXT: mov x13, v1.d[1] -; NONEON-NOSVE-NEXT: mov x14, v0.d[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: mul x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] ; NONEON-NOSVE-NEXT: mul x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov x9, d3 -; NONEON-NOSVE-NEXT: mul x10, x11, x10 -; NONEON-NOSVE-NEXT: mul x9, x12, x9 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: mul x11, x14, x13 -; NONEON-NOSVE-NEXT: fmov d0, x9 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 -; NONEON-NOSVE-NEXT: mov v0.d[1], x10 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: mul x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: mul x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -728,7 +1659,27 @@ define <4 x i8> @sub_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sub <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -745,7 +1696,43 @@ define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sub <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -762,7 +1749,74 @@ define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sub <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -780,11 +1834,143 @@ define void @sub_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sub_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: sub v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -804,7 +1990,18 @@ define <2 x i16> @sub_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sub <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -821,7 +2018,27 @@ define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sub <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -838,7 +2055,42 @@ define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sub <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -856,11 +2108,79 @@ define void @sub_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sub_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: sub v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -880,7 +2200,18 @@ define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sub <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -897,7 +2228,24 @@ define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sub <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -915,11 +2263,43 @@ define void @sub_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sub_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: sub v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -939,7 +2319,14 @@ define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sub <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -956,7 +2343,17 @@ define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: sub_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sub x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sub <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -974,11 +2371,29 @@ define void @sub_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sub_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: sub v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: sub x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sub x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -1003,9 +2418,26 @@ define <4 x i8> @abs_v4i8(<4 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: abs v0.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #2] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: cneg w8, w9, mi +; NONEON-NOSVE-NEXT: cmp w10, #0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: cneg w8, w10, mi +; NONEON-NOSVE-NEXT: cmp w11, #0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: cneg w8, w11, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.abs.v4i8(<4 x i8> %op1, i1 false) ret <4 x i8> %res @@ -1022,7 +2454,42 @@ define <8 x i8> @abs_v8i8(<8 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false) ret <8 x i8> %res @@ -1039,7 +2506,74 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false) ret <16 x i8> %res @@ -1057,10 +2591,140 @@ define void @abs_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.16b, v0.16b -; NONEON-NOSVE-NEXT: abs v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false) @@ -1080,9 +2744,17 @@ define <2 x i16> @abs_v2i16(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: cneg w9, w9, mi +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %op1, i1 false) ret <2 x i16> %res @@ -1099,7 +2771,26 @@ define <4 x i16> @abs_v4i16(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false) ret <4 x i16> %res @@ -1116,7 +2807,42 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.8h, v0.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false) ret <8 x i16> %res @@ -1134,10 +2860,76 @@ define void @abs_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.8h, v0.8h -; NONEON-NOSVE-NEXT: abs v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false) @@ -1156,7 +2948,17 @@ define <2 x i32> @abs_v2i32(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) ret <2 x i32> %res @@ -1173,7 +2975,24 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) ret <4 x i32> %res @@ -1191,10 +3010,40 @@ define void @abs_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: abs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) @@ -1213,7 +3062,14 @@ define <1 x i64> @abs_v1i64(<1 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs d0, d0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false) ret <1 x i64> %res @@ -1230,7 +3086,17 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: abs_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) ret <2 x i64> %res @@ -1248,10 +3114,26 @@ define void @abs_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: abs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll index ee0ca0e60b5e51..0b4316686fff64 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll @@ -22,7 +22,51 @@ define <8 x i8> @icmp_eq_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <8 x i8> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i8> @@ -42,7 +86,90 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <16 x i8> %op1, %op2 %sext = sext <16 x i1> %cmp to <16 x i8> @@ -64,11 +191,175 @@ define void @icmp_eq_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: cmeq v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -91,7 +382,31 @@ define <4 x i16> @icmp_eq_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <4 x i16> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i16> @@ -111,7 +426,50 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <8 x i16> %op1, %op2 %sext = sext <8 x i1> %cmp to <8 x i16> @@ -133,11 +491,95 @@ define void @icmp_eq_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: cmeq v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -160,7 +602,19 @@ define <2 x i32> @icmp_eq_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <2 x i32> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i32> @@ -180,7 +634,26 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <4 x i32> %op1, %op2 %sext = sext <4 x i1> %cmp to <4 x i32> @@ -202,11 +675,47 @@ define void @icmp_eq_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: cmeq v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w10, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -229,7 +738,15 @@ define <1 x i64> @icmp_eq_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <1 x i64> %op1, %op2 %sext = sext <1 x i1> %cmp to <1 x i64> @@ -249,7 +766,18 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmeq v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, eq +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %cmp = icmp eq <2 x i64> %op1, %op2 %sext = sext <2 x i1> %cmp to <2 x i64> @@ -271,11 +799,31 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: cmeq v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: csetm x10, eq +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #80] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, eq +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, eq +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -304,13 +852,175 @@ define void @icmp_ne_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_ne_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: cmeq v1.16b, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: mvn v1.16b, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -337,10 +1047,53 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_sge_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmge v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, ge +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b @@ -369,11 +1122,95 @@ define void @icmp_sgt_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_sgt_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmgt v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: cmgt v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -400,10 +1237,29 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_sle_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmge v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csetm w10, le +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w10, le +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, le +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %op2 = load <4 x i32>, ptr %b @@ -432,11 +1288,47 @@ define void @icmp_slt_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_slt_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmgt v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: cmgt v1.4s, v3.4s, v2.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w10, lt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csetm w8, lt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -463,10 +1355,21 @@ define void @icmp_uge_v2i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_uge_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmhs v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, hs +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, hs +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b @@ -493,10 +1396,21 @@ define void @icmp_ugt_v2i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_ugt_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmhi v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, hi +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, hi +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b @@ -523,10 +1437,21 @@ define void @icmp_ule_v2i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_ule_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmhs v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, ls +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, ls +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b @@ -553,10 +1478,21 @@ define void @icmp_ult_v2i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: icmp_ult_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: cmhi v0.2d, v1.2d, v0.2d +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x10, lo +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csetm x8, lo +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index d79d6c18ed5a6e..e09b1613a54afb 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -28,27 +28,27 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w10, v0.h[0] -; NONEON-NOSVE-NEXT: smov w11, v0.h[2] -; NONEON-NOSVE-NEXT: smov w12, v0.h[3] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.h[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #8] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #20] ; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.h[2] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] ; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 -; NONEON-NOSVE-NEXT: mov v0.h[2], w10 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #16] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: strh w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -80,41 +80,43 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w10, v0.b[0] -; NONEON-NOSVE-NEXT: smov w11, v0.b[2] -; NONEON-NOSVE-NEXT: smov w12, v0.b[3] -; NONEON-NOSVE-NEXT: smov w13, v0.b[4] -; NONEON-NOSVE-NEXT: smov w14, v0.b[5] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.b[0] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: smov w9, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v2.b[2], w10 -; NONEON-NOSVE-NEXT: smov w10, v0.b[6] -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.b[5] -; NONEON-NOSVE-NEXT: mov v2.b[3], w11 -; NONEON-NOSVE-NEXT: smov w11, v0.b[7] -; NONEON-NOSVE-NEXT: sdiv w8, w14, w13 -; NONEON-NOSVE-NEXT: mov v2.b[4], w12 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.b[6], w9 -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -166,71 +168,74 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w10, v0.b[0] -; NONEON-NOSVE-NEXT: smov w11, v0.b[2] -; NONEON-NOSVE-NEXT: smov w12, v0.b[3] -; NONEON-NOSVE-NEXT: smov w13, v0.b[4] -; NONEON-NOSVE-NEXT: smov w14, v0.b[5] -; NONEON-NOSVE-NEXT: smov w15, v0.b[6] -; NONEON-NOSVE-NEXT: smov w16, v0.b[7] -; NONEON-NOSVE-NEXT: smov w17, v0.b[8] -; NONEON-NOSVE-NEXT: smov w18, v0.b[9] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.b[0] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: smov w9, v1.b[10] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v2.b[2], w10 -; NONEON-NOSVE-NEXT: smov w10, v0.b[10] -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.b[5] -; NONEON-NOSVE-NEXT: mov v2.b[3], w11 -; NONEON-NOSVE-NEXT: smov w11, v0.b[11] -; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 -; NONEON-NOSVE-NEXT: smov w14, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[4], w12 -; NONEON-NOSVE-NEXT: smov w12, v0.b[12] -; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 -; NONEON-NOSVE-NEXT: smov w15, v1.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[5], w13 -; NONEON-NOSVE-NEXT: smov w13, v0.b[13] -; NONEON-NOSVE-NEXT: sdiv w15, w16, w15 -; NONEON-NOSVE-NEXT: smov w16, v1.b[8] -; NONEON-NOSVE-NEXT: mov v2.b[6], w14 -; NONEON-NOSVE-NEXT: sdiv w16, w17, w16 -; NONEON-NOSVE-NEXT: smov w17, v1.b[9] -; NONEON-NOSVE-NEXT: mov v2.b[7], w15 -; NONEON-NOSVE-NEXT: sdiv w8, w18, w17 -; NONEON-NOSVE-NEXT: mov v2.b[8], w16 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[11] -; NONEON-NOSVE-NEXT: mov v2.b[9], w8 -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.b[12] -; NONEON-NOSVE-NEXT: mov v2.b[10], w9 -; NONEON-NOSVE-NEXT: smov w9, v1.b[14] -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v1.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[11], w10 -; NONEON-NOSVE-NEXT: smov w10, v1.b[15] -; NONEON-NOSVE-NEXT: sdiv w8, w13, w12 -; NONEON-NOSVE-NEXT: smov w12, v0.b[14] -; NONEON-NOSVE-NEXT: mov v2.b[12], w11 -; NONEON-NOSVE-NEXT: smov w11, v0.b[15] -; NONEON-NOSVE-NEXT: sdiv w9, w12, w9 -; NONEON-NOSVE-NEXT: mov v2.b[13], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.b[14], w9 -; NONEON-NOSVE-NEXT: mov v2.b[15], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sdiv <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -315,159 +320,143 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sdiv_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str x27, [sp, #-80]! // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w10, v0.b[0] -; NONEON-NOSVE-NEXT: smov w11, v0.b[2] -; NONEON-NOSVE-NEXT: smov w12, v0.b[3] -; NONEON-NOSVE-NEXT: smov w13, v0.b[4] -; NONEON-NOSVE-NEXT: smov w14, v0.b[5] -; NONEON-NOSVE-NEXT: smov w15, v0.b[6] -; NONEON-NOSVE-NEXT: smov w17, v0.b[8] -; NONEON-NOSVE-NEXT: smov w2, v0.b[10] -; NONEON-NOSVE-NEXT: smov w3, v0.b[11] -; NONEON-NOSVE-NEXT: smov w4, v0.b[12] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.b[0] -; NONEON-NOSVE-NEXT: smov w5, v0.b[13] -; NONEON-NOSVE-NEXT: smov w6, v0.b[14] -; NONEON-NOSVE-NEXT: smov w1, v3.b[1] -; NONEON-NOSVE-NEXT: smov w7, v2.b[0] -; NONEON-NOSVE-NEXT: smov w19, v2.b[2] -; NONEON-NOSVE-NEXT: smov w20, v2.b[3] -; NONEON-NOSVE-NEXT: smov w21, v2.b[4] -; NONEON-NOSVE-NEXT: smov w22, v2.b[5] -; NONEON-NOSVE-NEXT: smov w23, v2.b[6] -; NONEON-NOSVE-NEXT: smov w24, v2.b[7] -; NONEON-NOSVE-NEXT: smov w25, v2.b[8] -; NONEON-NOSVE-NEXT: smov w26, v2.b[9] -; NONEON-NOSVE-NEXT: smov w27, v2.b[10] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[2] -; NONEON-NOSVE-NEXT: sdiv w11, w11, w10 -; NONEON-NOSVE-NEXT: smov w10, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s5, w9 -; NONEON-NOSVE-NEXT: smov w9, v3.b[11] -; NONEON-NOSVE-NEXT: mov v5.b[1], w8 -; NONEON-NOSVE-NEXT: sdiv w10, w12, w10 -; NONEON-NOSVE-NEXT: smov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v5.b[2], w11 -; NONEON-NOSVE-NEXT: smov w11, v2.b[11] -; NONEON-NOSVE-NEXT: sdiv w13, w13, w12 -; NONEON-NOSVE-NEXT: smov w12, v1.b[5] -; NONEON-NOSVE-NEXT: mov v5.b[3], w10 -; NONEON-NOSVE-NEXT: smov w10, v3.b[12] -; NONEON-NOSVE-NEXT: sdiv w12, w14, w12 -; NONEON-NOSVE-NEXT: smov w14, v1.b[6] -; NONEON-NOSVE-NEXT: mov v5.b[4], w13 -; NONEON-NOSVE-NEXT: smov w13, v2.b[14] -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 -; NONEON-NOSVE-NEXT: smov w14, v1.b[7] -; NONEON-NOSVE-NEXT: smov w15, v0.b[7] -; NONEON-NOSVE-NEXT: mov v5.b[5], w12 -; NONEON-NOSVE-NEXT: smov w12, v2.b[13] -; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 -; NONEON-NOSVE-NEXT: smov w15, v1.b[8] -; NONEON-NOSVE-NEXT: mov v5.b[6], w16 -; NONEON-NOSVE-NEXT: sdiv w18, w17, w15 -; NONEON-NOSVE-NEXT: smov w15, v1.b[9] -; NONEON-NOSVE-NEXT: smov w17, v0.b[9] -; NONEON-NOSVE-NEXT: mov v5.b[7], w14 -; NONEON-NOSVE-NEXT: sdiv w17, w17, w15 -; NONEON-NOSVE-NEXT: smov w15, v1.b[10] -; NONEON-NOSVE-NEXT: mov v5.b[8], w18 -; NONEON-NOSVE-NEXT: sdiv w15, w2, w15 -; NONEON-NOSVE-NEXT: smov w2, v1.b[11] -; NONEON-NOSVE-NEXT: mov v5.b[9], w17 -; NONEON-NOSVE-NEXT: sdiv w2, w3, w2 -; NONEON-NOSVE-NEXT: smov w3, v1.b[12] -; NONEON-NOSVE-NEXT: mov v5.b[10], w15 -; NONEON-NOSVE-NEXT: sdiv w3, w4, w3 -; NONEON-NOSVE-NEXT: smov w4, v1.b[13] -; NONEON-NOSVE-NEXT: mov v5.b[11], w2 -; NONEON-NOSVE-NEXT: sdiv w4, w5, w4 -; NONEON-NOSVE-NEXT: smov w5, v1.b[14] -; NONEON-NOSVE-NEXT: mov v5.b[12], w3 -; NONEON-NOSVE-NEXT: sdiv w5, w6, w5 -; NONEON-NOSVE-NEXT: smov w6, v2.b[1] -; NONEON-NOSVE-NEXT: mov v5.b[13], w4 -; NONEON-NOSVE-NEXT: sdiv w1, w6, w1 -; NONEON-NOSVE-NEXT: smov w6, v3.b[0] -; NONEON-NOSVE-NEXT: mov v5.b[14], w5 -; NONEON-NOSVE-NEXT: sdiv w6, w7, w6 -; NONEON-NOSVE-NEXT: smov w7, v3.b[2] -; NONEON-NOSVE-NEXT: sdiv w7, w19, w7 -; NONEON-NOSVE-NEXT: smov w19, v3.b[3] -; NONEON-NOSVE-NEXT: fmov s4, w6 -; NONEON-NOSVE-NEXT: mov v4.b[1], w1 -; NONEON-NOSVE-NEXT: sdiv w19, w20, w19 -; NONEON-NOSVE-NEXT: smov w20, v3.b[4] -; NONEON-NOSVE-NEXT: mov v4.b[2], w7 -; NONEON-NOSVE-NEXT: sdiv w20, w21, w20 -; NONEON-NOSVE-NEXT: smov w21, v3.b[5] -; NONEON-NOSVE-NEXT: mov v4.b[3], w19 -; NONEON-NOSVE-NEXT: sdiv w21, w22, w21 -; NONEON-NOSVE-NEXT: smov w22, v3.b[6] -; NONEON-NOSVE-NEXT: mov v4.b[4], w20 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w22, w23, w22 -; NONEON-NOSVE-NEXT: smov w23, v3.b[7] -; NONEON-NOSVE-NEXT: mov v4.b[5], w21 -; NONEON-NOSVE-NEXT: sdiv w23, w24, w23 -; NONEON-NOSVE-NEXT: smov w24, v3.b[8] -; NONEON-NOSVE-NEXT: mov v4.b[6], w22 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w24, w25, w24 -; NONEON-NOSVE-NEXT: smov w25, v3.b[9] -; NONEON-NOSVE-NEXT: mov v4.b[7], w23 -; NONEON-NOSVE-NEXT: sdiv w25, w26, w25 -; NONEON-NOSVE-NEXT: smov w26, v3.b[10] -; NONEON-NOSVE-NEXT: mov v4.b[8], w24 -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w8, w27, w26 -; NONEON-NOSVE-NEXT: mov v4.b[9], w25 -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w9, w11, w9 -; NONEON-NOSVE-NEXT: smov w11, v2.b[12] -; NONEON-NOSVE-NEXT: mov v4.b[10], w8 -; NONEON-NOSVE-NEXT: smov w8, v3.b[15] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v3.b[13] -; NONEON-NOSVE-NEXT: mov v4.b[11], w9 -; NONEON-NOSVE-NEXT: smov w9, v1.b[15] -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v3.b[14] -; NONEON-NOSVE-NEXT: mov v4.b[12], w10 -; NONEON-NOSVE-NEXT: smov w10, v0.b[15] -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: smov w13, v2.b[15] -; NONEON-NOSVE-NEXT: mov v4.b[13], w11 -; NONEON-NOSVE-NEXT: sdiv w8, w13, w8 -; NONEON-NOSVE-NEXT: mov v4.b[14], w12 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov v4.b[15], w8 -; NONEON-NOSVE-NEXT: mov v5.b[15], w9 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] -; NONEON-NOSVE-NEXT: ldr x27, [sp], #80 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #47] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #62] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #61] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #59] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #58] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #57] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #55] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #54] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #53] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #52] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #51] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #50] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #49] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -490,19 +479,18 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w10, v0.s[1] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #8] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -523,25 +511,27 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w10, v0.h[0] -; NONEON-NOSVE-NEXT: smov w11, v0.h[2] -; NONEON-NOSVE-NEXT: smov w12, v0.h[3] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.h[0] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.h[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 -; NONEON-NOSVE-NEXT: mov v0.h[2], w10 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -572,39 +562,42 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w10, v0.h[0] -; NONEON-NOSVE-NEXT: smov w11, v0.h[2] -; NONEON-NOSVE-NEXT: smov w12, v0.h[3] -; NONEON-NOSVE-NEXT: smov w13, v0.h[4] -; NONEON-NOSVE-NEXT: smov w14, v0.h[5] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.h[0] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.h[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: smov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: smov w9, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v1.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[2], w10 -; NONEON-NOSVE-NEXT: smov w10, v0.h[6] -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.h[5] -; NONEON-NOSVE-NEXT: mov v2.h[3], w11 -; NONEON-NOSVE-NEXT: smov w11, v0.h[7] -; NONEON-NOSVE-NEXT: sdiv w8, w14, w13 -; NONEON-NOSVE-NEXT: mov v2.h[4], w12 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.h[6], w9 -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -649,75 +642,79 @@ define void @sdiv_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sdiv_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w10, v0.h[0] -; NONEON-NOSVE-NEXT: smov w11, v0.h[2] -; NONEON-NOSVE-NEXT: smov w12, v0.h[3] -; NONEON-NOSVE-NEXT: smov w13, v0.h[4] -; NONEON-NOSVE-NEXT: smov w14, v0.h[5] -; NONEON-NOSVE-NEXT: smov w15, v0.h[6] -; NONEON-NOSVE-NEXT: smov w16, v2.h[1] -; NONEON-NOSVE-NEXT: smov w17, v2.h[0] -; NONEON-NOSVE-NEXT: smov w18, v2.h[2] -; NONEON-NOSVE-NEXT: smov w1, v2.h[3] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v1.h[0] -; NONEON-NOSVE-NEXT: smov w2, v2.h[4] -; NONEON-NOSVE-NEXT: smov w3, v2.h[5] -; NONEON-NOSVE-NEXT: smov w4, v2.h[6] -; NONEON-NOSVE-NEXT: sdiv w10, w10, w9 -; NONEON-NOSVE-NEXT: smov w9, v1.h[2] -; NONEON-NOSVE-NEXT: sdiv w9, w11, w9 -; NONEON-NOSVE-NEXT: smov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s5, w10 -; NONEON-NOSVE-NEXT: smov w10, v3.h[7] -; NONEON-NOSVE-NEXT: mov v5.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: smov w12, v1.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[2], w9 -; NONEON-NOSVE-NEXT: smov w9, v2.h[7] -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[3], w11 -; NONEON-NOSVE-NEXT: smov w11, v0.h[7] -; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 -; NONEON-NOSVE-NEXT: smov w14, v1.h[6] -; NONEON-NOSVE-NEXT: mov v5.h[4], w12 -; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 -; NONEON-NOSVE-NEXT: smov w15, v3.h[1] -; NONEON-NOSVE-NEXT: mov v5.h[5], w13 -; NONEON-NOSVE-NEXT: sdiv w15, w16, w15 -; NONEON-NOSVE-NEXT: smov w16, v3.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[6], w14 -; NONEON-NOSVE-NEXT: sdiv w16, w17, w16 -; NONEON-NOSVE-NEXT: smov w17, v3.h[2] -; NONEON-NOSVE-NEXT: sdiv w17, w18, w17 -; NONEON-NOSVE-NEXT: smov w18, v3.h[3] -; NONEON-NOSVE-NEXT: fmov s4, w16 -; NONEON-NOSVE-NEXT: mov v4.h[1], w15 -; NONEON-NOSVE-NEXT: sdiv w18, w1, w18 -; NONEON-NOSVE-NEXT: smov w1, v3.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[2], w17 -; NONEON-NOSVE-NEXT: sdiv w1, w2, w1 -; NONEON-NOSVE-NEXT: smov w2, v3.h[5] -; NONEON-NOSVE-NEXT: mov v4.h[3], w18 -; NONEON-NOSVE-NEXT: sdiv w2, w3, w2 -; NONEON-NOSVE-NEXT: smov w3, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[4], w1 -; NONEON-NOSVE-NEXT: sdiv w8, w4, w3 -; NONEON-NOSVE-NEXT: mov v4.h[5], w2 -; NONEON-NOSVE-NEXT: sdiv w9, w9, w10 -; NONEON-NOSVE-NEXT: smov w10, v1.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[6], w8 -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov v4.h[7], w9 -; NONEON-NOSVE-NEXT: mov v5.h[7], w10 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -738,17 +735,17 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: mov w9, v1.s[1] -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -766,22 +763,22 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w10, s0 -; NONEON-NOSVE-NEXT: mov w11, v0.s[2] -; NONEON-NOSVE-NEXT: mov w12, v0.s[3] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] ; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: fmov w9, s1 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov w10, v1.s[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov w11, v1.s[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: sdiv w8, w12, w11 -; NONEON-NOSVE-NEXT: mov v0.s[2], w10 -; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -801,41 +798,39 @@ define void @sdiv_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w10, s0 -; NONEON-NOSVE-NEXT: mov w11, v0.s[2] -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w12, v2.s[1] -; NONEON-NOSVE-NEXT: fmov w13, s2 -; NONEON-NOSVE-NEXT: mov w14, v2.s[2] -; NONEON-NOSVE-NEXT: mov w15, v2.s[3] -; NONEON-NOSVE-NEXT: mov w16, v0.s[3] -; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 -; NONEON-NOSVE-NEXT: fmov w9, s1 -; NONEON-NOSVE-NEXT: sdiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov w10, v1.s[2] -; NONEON-NOSVE-NEXT: sdiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov w11, v3.s[1] -; NONEON-NOSVE-NEXT: sdiv w11, w12, w11 -; NONEON-NOSVE-NEXT: fmov w12, s3 -; NONEON-NOSVE-NEXT: sdiv w12, w13, w12 -; NONEON-NOSVE-NEXT: mov w13, v3.s[2] -; NONEON-NOSVE-NEXT: sdiv w13, w14, w13 -; NONEON-NOSVE-NEXT: mov w14, v3.s[3] -; NONEON-NOSVE-NEXT: fmov s0, w12 -; NONEON-NOSVE-NEXT: mov v0.s[1], w11 -; NONEON-NOSVE-NEXT: sdiv w14, w15, w14 -; NONEON-NOSVE-NEXT: mov w15, v1.s[3] -; NONEON-NOSVE-NEXT: fmov s1, w9 -; NONEON-NOSVE-NEXT: mov v0.s[2], w13 -; NONEON-NOSVE-NEXT: mov v1.s[1], w8 -; NONEON-NOSVE-NEXT: mov v1.s[2], w10 -; NONEON-NOSVE-NEXT: sdiv w8, w16, w15 -; NONEON-NOSVE-NEXT: mov v0.s[3], w14 -; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -856,12 +851,14 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d1 ; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -879,14 +876,16 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x10, v0.d[1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: sdiv x9, x10, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -906,25 +905,27 @@ define void @sdiv_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x10, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x11, d2 -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] ; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 -; NONEON-NOSVE-NEXT: mov x9, v3.d[1] -; NONEON-NOSVE-NEXT: sdiv x9, x10, x9 -; NONEON-NOSVE-NEXT: fmov x10, d3 -; NONEON-NOSVE-NEXT: sdiv x10, x11, x10 -; NONEON-NOSVE-NEXT: mov x11, v1.d[1] -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: sdiv x11, x12, x11 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -954,33 +955,27 @@ define <4 x i8> @udiv_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w10, v0.h[0] -; NONEON-NOSVE-NEXT: umov w11, v0.h[2] -; NONEON-NOSVE-NEXT: umov w12, v0.h[3] -; NONEON-NOSVE-NEXT: and w8, w8, #0xff -; NONEON-NOSVE-NEXT: and w9, w9, #0xff -; NONEON-NOSVE-NEXT: and w10, w10, #0xff -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.h[0] -; NONEON-NOSVE-NEXT: and w11, w11, #0xff -; NONEON-NOSVE-NEXT: and w9, w9, #0xff +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #8] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] ; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.h[2] -; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] ; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: and w9, w11, #0xff -; NONEON-NOSVE-NEXT: and w11, w12, #0xff -; NONEON-NOSVE-NEXT: udiv w8, w11, w9 -; NONEON-NOSVE-NEXT: mov v0.h[2], w10 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #16] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w11, w12, w11 +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: strh w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -1012,41 +1007,43 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w10, v0.b[0] -; NONEON-NOSVE-NEXT: umov w11, v0.b[2] -; NONEON-NOSVE-NEXT: umov w12, v0.b[3] -; NONEON-NOSVE-NEXT: umov w13, v0.b[4] -; NONEON-NOSVE-NEXT: umov w14, v0.b[5] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.b[0] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: umov w9, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v2.b[2], w10 -; NONEON-NOSVE-NEXT: umov w10, v0.b[6] -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.b[5] -; NONEON-NOSVE-NEXT: mov v2.b[3], w11 -; NONEON-NOSVE-NEXT: umov w11, v0.b[7] -; NONEON-NOSVE-NEXT: udiv w8, w14, w13 -; NONEON-NOSVE-NEXT: mov v2.b[4], w12 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: udiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.b[6], w9 -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = udiv <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -1098,71 +1095,74 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w10, v0.b[0] -; NONEON-NOSVE-NEXT: umov w11, v0.b[2] -; NONEON-NOSVE-NEXT: umov w12, v0.b[3] -; NONEON-NOSVE-NEXT: umov w13, v0.b[4] -; NONEON-NOSVE-NEXT: umov w14, v0.b[5] -; NONEON-NOSVE-NEXT: umov w15, v0.b[6] -; NONEON-NOSVE-NEXT: umov w16, v0.b[7] -; NONEON-NOSVE-NEXT: umov w17, v0.b[8] -; NONEON-NOSVE-NEXT: umov w18, v0.b[9] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.b[0] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: umov w9, v1.b[10] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v2.b[2], w10 -; NONEON-NOSVE-NEXT: umov w10, v0.b[10] -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.b[5] -; NONEON-NOSVE-NEXT: mov v2.b[3], w11 -; NONEON-NOSVE-NEXT: umov w11, v0.b[11] -; NONEON-NOSVE-NEXT: udiv w13, w14, w13 -; NONEON-NOSVE-NEXT: umov w14, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[4], w12 -; NONEON-NOSVE-NEXT: umov w12, v0.b[12] -; NONEON-NOSVE-NEXT: udiv w14, w15, w14 -; NONEON-NOSVE-NEXT: umov w15, v1.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[5], w13 -; NONEON-NOSVE-NEXT: umov w13, v0.b[13] -; NONEON-NOSVE-NEXT: udiv w15, w16, w15 -; NONEON-NOSVE-NEXT: umov w16, v1.b[8] -; NONEON-NOSVE-NEXT: mov v2.b[6], w14 -; NONEON-NOSVE-NEXT: udiv w16, w17, w16 -; NONEON-NOSVE-NEXT: umov w17, v1.b[9] -; NONEON-NOSVE-NEXT: mov v2.b[7], w15 -; NONEON-NOSVE-NEXT: udiv w8, w18, w17 -; NONEON-NOSVE-NEXT: mov v2.b[8], w16 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[11] -; NONEON-NOSVE-NEXT: mov v2.b[9], w8 -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.b[12] -; NONEON-NOSVE-NEXT: mov v2.b[10], w9 -; NONEON-NOSVE-NEXT: umov w9, v1.b[14] -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v1.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[11], w10 -; NONEON-NOSVE-NEXT: umov w10, v1.b[15] -; NONEON-NOSVE-NEXT: udiv w8, w13, w12 -; NONEON-NOSVE-NEXT: umov w12, v0.b[14] -; NONEON-NOSVE-NEXT: mov v2.b[12], w11 -; NONEON-NOSVE-NEXT: umov w11, v0.b[15] -; NONEON-NOSVE-NEXT: udiv w9, w12, w9 -; NONEON-NOSVE-NEXT: mov v2.b[13], w8 -; NONEON-NOSVE-NEXT: udiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.b[14], w9 -; NONEON-NOSVE-NEXT: mov v2.b[15], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = udiv <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -1247,159 +1247,143 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: udiv_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str x27, [sp, #-80]! // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w10, v0.b[0] -; NONEON-NOSVE-NEXT: umov w11, v0.b[2] -; NONEON-NOSVE-NEXT: umov w12, v0.b[3] -; NONEON-NOSVE-NEXT: umov w13, v0.b[4] -; NONEON-NOSVE-NEXT: umov w14, v0.b[5] -; NONEON-NOSVE-NEXT: umov w15, v0.b[6] -; NONEON-NOSVE-NEXT: umov w17, v0.b[8] -; NONEON-NOSVE-NEXT: umov w2, v0.b[10] -; NONEON-NOSVE-NEXT: umov w3, v0.b[11] -; NONEON-NOSVE-NEXT: umov w4, v0.b[12] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.b[0] -; NONEON-NOSVE-NEXT: umov w5, v0.b[13] -; NONEON-NOSVE-NEXT: umov w6, v0.b[14] -; NONEON-NOSVE-NEXT: umov w1, v3.b[1] -; NONEON-NOSVE-NEXT: umov w7, v2.b[0] -; NONEON-NOSVE-NEXT: umov w19, v2.b[2] -; NONEON-NOSVE-NEXT: umov w20, v2.b[3] -; NONEON-NOSVE-NEXT: umov w21, v2.b[4] -; NONEON-NOSVE-NEXT: umov w22, v2.b[5] -; NONEON-NOSVE-NEXT: umov w23, v2.b[6] -; NONEON-NOSVE-NEXT: umov w24, v2.b[7] -; NONEON-NOSVE-NEXT: umov w25, v2.b[8] -; NONEON-NOSVE-NEXT: umov w26, v2.b[9] -; NONEON-NOSVE-NEXT: umov w27, v2.b[10] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[2] -; NONEON-NOSVE-NEXT: udiv w11, w11, w10 -; NONEON-NOSVE-NEXT: umov w10, v1.b[3] -; NONEON-NOSVE-NEXT: fmov s5, w9 -; NONEON-NOSVE-NEXT: umov w9, v3.b[11] -; NONEON-NOSVE-NEXT: mov v5.b[1], w8 -; NONEON-NOSVE-NEXT: udiv w10, w12, w10 -; NONEON-NOSVE-NEXT: umov w12, v1.b[4] -; NONEON-NOSVE-NEXT: mov v5.b[2], w11 -; NONEON-NOSVE-NEXT: umov w11, v2.b[11] -; NONEON-NOSVE-NEXT: udiv w13, w13, w12 -; NONEON-NOSVE-NEXT: umov w12, v1.b[5] -; NONEON-NOSVE-NEXT: mov v5.b[3], w10 -; NONEON-NOSVE-NEXT: umov w10, v3.b[12] -; NONEON-NOSVE-NEXT: udiv w12, w14, w12 -; NONEON-NOSVE-NEXT: umov w14, v1.b[6] -; NONEON-NOSVE-NEXT: mov v5.b[4], w13 -; NONEON-NOSVE-NEXT: umov w13, v2.b[14] -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 -; NONEON-NOSVE-NEXT: umov w14, v1.b[7] -; NONEON-NOSVE-NEXT: umov w15, v0.b[7] -; NONEON-NOSVE-NEXT: mov v5.b[5], w12 -; NONEON-NOSVE-NEXT: umov w12, v2.b[13] -; NONEON-NOSVE-NEXT: udiv w14, w15, w14 -; NONEON-NOSVE-NEXT: umov w15, v1.b[8] -; NONEON-NOSVE-NEXT: mov v5.b[6], w16 -; NONEON-NOSVE-NEXT: udiv w18, w17, w15 -; NONEON-NOSVE-NEXT: umov w15, v1.b[9] -; NONEON-NOSVE-NEXT: umov w17, v0.b[9] -; NONEON-NOSVE-NEXT: mov v5.b[7], w14 -; NONEON-NOSVE-NEXT: udiv w17, w17, w15 -; NONEON-NOSVE-NEXT: umov w15, v1.b[10] -; NONEON-NOSVE-NEXT: mov v5.b[8], w18 -; NONEON-NOSVE-NEXT: udiv w15, w2, w15 -; NONEON-NOSVE-NEXT: umov w2, v1.b[11] -; NONEON-NOSVE-NEXT: mov v5.b[9], w17 -; NONEON-NOSVE-NEXT: udiv w2, w3, w2 -; NONEON-NOSVE-NEXT: umov w3, v1.b[12] -; NONEON-NOSVE-NEXT: mov v5.b[10], w15 -; NONEON-NOSVE-NEXT: udiv w3, w4, w3 -; NONEON-NOSVE-NEXT: umov w4, v1.b[13] -; NONEON-NOSVE-NEXT: mov v5.b[11], w2 -; NONEON-NOSVE-NEXT: udiv w4, w5, w4 -; NONEON-NOSVE-NEXT: umov w5, v1.b[14] -; NONEON-NOSVE-NEXT: mov v5.b[12], w3 -; NONEON-NOSVE-NEXT: udiv w5, w6, w5 -; NONEON-NOSVE-NEXT: umov w6, v2.b[1] -; NONEON-NOSVE-NEXT: mov v5.b[13], w4 -; NONEON-NOSVE-NEXT: udiv w1, w6, w1 -; NONEON-NOSVE-NEXT: umov w6, v3.b[0] -; NONEON-NOSVE-NEXT: mov v5.b[14], w5 -; NONEON-NOSVE-NEXT: udiv w6, w7, w6 -; NONEON-NOSVE-NEXT: umov w7, v3.b[2] -; NONEON-NOSVE-NEXT: udiv w7, w19, w7 -; NONEON-NOSVE-NEXT: umov w19, v3.b[3] -; NONEON-NOSVE-NEXT: fmov s4, w6 -; NONEON-NOSVE-NEXT: mov v4.b[1], w1 -; NONEON-NOSVE-NEXT: udiv w19, w20, w19 -; NONEON-NOSVE-NEXT: umov w20, v3.b[4] -; NONEON-NOSVE-NEXT: mov v4.b[2], w7 -; NONEON-NOSVE-NEXT: udiv w20, w21, w20 -; NONEON-NOSVE-NEXT: umov w21, v3.b[5] -; NONEON-NOSVE-NEXT: mov v4.b[3], w19 -; NONEON-NOSVE-NEXT: udiv w21, w22, w21 -; NONEON-NOSVE-NEXT: umov w22, v3.b[6] -; NONEON-NOSVE-NEXT: mov v4.b[4], w20 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w22, w23, w22 -; NONEON-NOSVE-NEXT: umov w23, v3.b[7] -; NONEON-NOSVE-NEXT: mov v4.b[5], w21 -; NONEON-NOSVE-NEXT: udiv w23, w24, w23 -; NONEON-NOSVE-NEXT: umov w24, v3.b[8] -; NONEON-NOSVE-NEXT: mov v4.b[6], w22 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w24, w25, w24 -; NONEON-NOSVE-NEXT: umov w25, v3.b[9] -; NONEON-NOSVE-NEXT: mov v4.b[7], w23 -; NONEON-NOSVE-NEXT: udiv w25, w26, w25 -; NONEON-NOSVE-NEXT: umov w26, v3.b[10] -; NONEON-NOSVE-NEXT: mov v4.b[8], w24 -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w8, w27, w26 -; NONEON-NOSVE-NEXT: mov v4.b[9], w25 -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w9, w11, w9 -; NONEON-NOSVE-NEXT: umov w11, v2.b[12] -; NONEON-NOSVE-NEXT: mov v4.b[10], w8 -; NONEON-NOSVE-NEXT: umov w8, v3.b[15] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v3.b[13] -; NONEON-NOSVE-NEXT: mov v4.b[11], w9 -; NONEON-NOSVE-NEXT: umov w9, v1.b[15] -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v3.b[14] -; NONEON-NOSVE-NEXT: mov v4.b[12], w10 -; NONEON-NOSVE-NEXT: umov w10, v0.b[15] -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; NONEON-NOSVE-NEXT: umov w13, v2.b[15] -; NONEON-NOSVE-NEXT: mov v4.b[13], w11 -; NONEON-NOSVE-NEXT: udiv w8, w13, w8 -; NONEON-NOSVE-NEXT: mov v4.b[14], w12 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov v4.b[15], w8 -; NONEON-NOSVE-NEXT: mov v5.b[15], w9 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] -; NONEON-NOSVE-NEXT: ldr x27, [sp], #80 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -1422,18 +1406,18 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w10, v0.s[1] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: mov w9, v1.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #8] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] ; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -1454,25 +1438,27 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w10, v0.h[0] -; NONEON-NOSVE-NEXT: umov w11, v0.h[2] -; NONEON-NOSVE-NEXT: umov w12, v0.h[3] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.h[0] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.h[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w8, w12, w11 -; NONEON-NOSVE-NEXT: mov v0.h[2], w10 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -1503,39 +1489,42 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w10, v0.h[0] -; NONEON-NOSVE-NEXT: umov w11, v0.h[2] -; NONEON-NOSVE-NEXT: umov w12, v0.h[3] -; NONEON-NOSVE-NEXT: umov w13, v0.h[4] -; NONEON-NOSVE-NEXT: umov w14, v0.h[5] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.h[0] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.h[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: umov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s2, w9 -; NONEON-NOSVE-NEXT: umov w9, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v1.h[4] -; NONEON-NOSVE-NEXT: mov v2.h[2], w10 -; NONEON-NOSVE-NEXT: umov w10, v0.h[6] -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.h[5] -; NONEON-NOSVE-NEXT: mov v2.h[3], w11 -; NONEON-NOSVE-NEXT: umov w11, v0.h[7] -; NONEON-NOSVE-NEXT: udiv w8, w14, w13 -; NONEON-NOSVE-NEXT: mov v2.h[4], w12 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: udiv w8, w11, w10 -; NONEON-NOSVE-NEXT: mov v2.h[6], w9 -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = udiv <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -1580,75 +1569,79 @@ define void @udiv_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: udiv_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w10, v0.h[0] -; NONEON-NOSVE-NEXT: umov w11, v0.h[2] -; NONEON-NOSVE-NEXT: umov w12, v0.h[3] -; NONEON-NOSVE-NEXT: umov w13, v0.h[4] -; NONEON-NOSVE-NEXT: umov w14, v0.h[5] -; NONEON-NOSVE-NEXT: umov w15, v0.h[6] -; NONEON-NOSVE-NEXT: umov w16, v2.h[1] -; NONEON-NOSVE-NEXT: umov w17, v2.h[0] -; NONEON-NOSVE-NEXT: umov w18, v2.h[2] -; NONEON-NOSVE-NEXT: umov w1, v2.h[3] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v1.h[0] -; NONEON-NOSVE-NEXT: umov w2, v2.h[4] -; NONEON-NOSVE-NEXT: umov w3, v2.h[5] -; NONEON-NOSVE-NEXT: umov w4, v2.h[6] -; NONEON-NOSVE-NEXT: udiv w10, w10, w9 -; NONEON-NOSVE-NEXT: umov w9, v1.h[2] -; NONEON-NOSVE-NEXT: udiv w9, w11, w9 -; NONEON-NOSVE-NEXT: umov w11, v1.h[3] -; NONEON-NOSVE-NEXT: fmov s5, w10 -; NONEON-NOSVE-NEXT: umov w10, v3.h[7] -; NONEON-NOSVE-NEXT: mov v5.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: umov w12, v1.h[4] -; NONEON-NOSVE-NEXT: mov v5.h[2], w9 -; NONEON-NOSVE-NEXT: umov w9, v2.h[7] -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.h[5] -; NONEON-NOSVE-NEXT: mov v5.h[3], w11 -; NONEON-NOSVE-NEXT: umov w11, v0.h[7] -; NONEON-NOSVE-NEXT: udiv w13, w14, w13 -; NONEON-NOSVE-NEXT: umov w14, v1.h[6] -; NONEON-NOSVE-NEXT: mov v5.h[4], w12 -; NONEON-NOSVE-NEXT: udiv w14, w15, w14 -; NONEON-NOSVE-NEXT: umov w15, v3.h[1] -; NONEON-NOSVE-NEXT: mov v5.h[5], w13 -; NONEON-NOSVE-NEXT: udiv w15, w16, w15 -; NONEON-NOSVE-NEXT: umov w16, v3.h[0] -; NONEON-NOSVE-NEXT: mov v5.h[6], w14 -; NONEON-NOSVE-NEXT: udiv w16, w17, w16 -; NONEON-NOSVE-NEXT: umov w17, v3.h[2] -; NONEON-NOSVE-NEXT: udiv w17, w18, w17 -; NONEON-NOSVE-NEXT: umov w18, v3.h[3] -; NONEON-NOSVE-NEXT: fmov s4, w16 -; NONEON-NOSVE-NEXT: mov v4.h[1], w15 -; NONEON-NOSVE-NEXT: udiv w18, w1, w18 -; NONEON-NOSVE-NEXT: umov w1, v3.h[4] -; NONEON-NOSVE-NEXT: mov v4.h[2], w17 -; NONEON-NOSVE-NEXT: udiv w1, w2, w1 -; NONEON-NOSVE-NEXT: umov w2, v3.h[5] -; NONEON-NOSVE-NEXT: mov v4.h[3], w18 -; NONEON-NOSVE-NEXT: udiv w2, w3, w2 -; NONEON-NOSVE-NEXT: umov w3, v3.h[6] -; NONEON-NOSVE-NEXT: mov v4.h[4], w1 -; NONEON-NOSVE-NEXT: udiv w8, w4, w3 -; NONEON-NOSVE-NEXT: mov v4.h[5], w2 -; NONEON-NOSVE-NEXT: udiv w9, w9, w10 -; NONEON-NOSVE-NEXT: umov w10, v1.h[7] -; NONEON-NOSVE-NEXT: mov v4.h[6], w8 -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov v4.h[7], w9 -; NONEON-NOSVE-NEXT: mov v5.h[7], w10 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -1669,17 +1662,17 @@ define <2 x i32> @udiv_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w10, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: mov w9, v1.s[1] -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -1697,22 +1690,22 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w10, s0 -; NONEON-NOSVE-NEXT: mov w11, v0.s[2] -; NONEON-NOSVE-NEXT: mov w12, v0.s[3] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] ; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: fmov w9, s1 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov w10, v1.s[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov w11, v1.s[3] -; NONEON-NOSVE-NEXT: fmov s0, w9 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: udiv w8, w12, w11 -; NONEON-NOSVE-NEXT: mov v0.s[2], w10 -; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = udiv <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -1732,41 +1725,39 @@ define void @udiv_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: udiv_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w10, s0 -; NONEON-NOSVE-NEXT: mov w11, v0.s[2] -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w12, v2.s[1] -; NONEON-NOSVE-NEXT: fmov w13, s2 -; NONEON-NOSVE-NEXT: mov w14, v2.s[2] -; NONEON-NOSVE-NEXT: mov w15, v2.s[3] -; NONEON-NOSVE-NEXT: mov w16, v0.s[3] -; NONEON-NOSVE-NEXT: udiv w8, w9, w8 -; NONEON-NOSVE-NEXT: fmov w9, s1 -; NONEON-NOSVE-NEXT: udiv w9, w10, w9 -; NONEON-NOSVE-NEXT: mov w10, v1.s[2] -; NONEON-NOSVE-NEXT: udiv w10, w11, w10 -; NONEON-NOSVE-NEXT: mov w11, v3.s[1] -; NONEON-NOSVE-NEXT: udiv w11, w12, w11 -; NONEON-NOSVE-NEXT: fmov w12, s3 -; NONEON-NOSVE-NEXT: udiv w12, w13, w12 -; NONEON-NOSVE-NEXT: mov w13, v3.s[2] -; NONEON-NOSVE-NEXT: udiv w13, w14, w13 -; NONEON-NOSVE-NEXT: mov w14, v3.s[3] -; NONEON-NOSVE-NEXT: fmov s0, w12 -; NONEON-NOSVE-NEXT: mov v0.s[1], w11 -; NONEON-NOSVE-NEXT: udiv w14, w15, w14 -; NONEON-NOSVE-NEXT: mov w15, v1.s[3] -; NONEON-NOSVE-NEXT: fmov s1, w9 -; NONEON-NOSVE-NEXT: mov v0.s[2], w13 -; NONEON-NOSVE-NEXT: mov v1.s[1], w8 -; NONEON-NOSVE-NEXT: mov v1.s[2], w10 -; NONEON-NOSVE-NEXT: udiv w8, w16, w15 -; NONEON-NOSVE-NEXT: mov v0.s[3], w14 -; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -1787,12 +1778,14 @@ define <1 x i64> @udiv_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d1 ; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: udiv x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = udiv <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -1810,14 +1803,16 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: udiv_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x10, v0.d[1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv x8, x9, x8 -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: udiv x9, x10, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = udiv <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -1837,25 +1832,27 @@ define void @udiv_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: udiv_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x10, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x11, d2 -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv x8, x9, x8 -; NONEON-NOSVE-NEXT: mov x9, v3.d[1] -; NONEON-NOSVE-NEXT: udiv x9, x10, x9 -; NONEON-NOSVE-NEXT: fmov x10, d3 -; NONEON-NOSVE-NEXT: udiv x10, x11, x10 -; NONEON-NOSVE-NEXT: mov x11, v1.d[1] -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: udiv x11, x12, x11 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -1905,23 +1902,66 @@ define void @udiv_constantsplat_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: udiv_constantsplat_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #8969 // =0x2309 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] ; NONEON-NOSVE-NEXT: movk w8, #22765, lsl #16 -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: umull2 v3.2d, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umull v4.2d, v1.2s, v0.2s -; NONEON-NOSVE-NEXT: umull2 v5.2d, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: umull v0.2d, v2.2s, v0.2s -; NONEON-NOSVE-NEXT: uzp2 v3.4s, v4.4s, v3.4s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v5.4s -; NONEON-NOSVE-NEXT: sub v1.4s, v1.4s, v3.4s -; NONEON-NOSVE-NEXT: sub v2.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: usra v3.4s, v1.4s, #1 -; NONEON-NOSVE-NEXT: usra v0.4s, v2.4s, #1 -; NONEON-NOSVE-NEXT: ushr v1.4s, v3.4s, #6 -; NONEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #6 -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w11, w9, #6 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w9, w9, #6 +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w11, w9, #6 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w9, w9, #6 +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w11, w9, #6 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w9, w9, #6 +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: umull x10, w9, w8 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w10 +; NONEON-NOSVE-NEXT: add w9, w10, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w11, w9, #6 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: umull x8, w9, w8 +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: sub w9, w9, w8 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #1 +; NONEON-NOSVE-NEXT: lsr w8, w8, #6 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = udiv <8 x i32> %op1, diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll index 9f8511b00c6ed1..2c2b79121ef820 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll @@ -30,18 +30,50 @@ define void @sext_v8i1_v8i32(<8 x i1> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i1_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: shl v0.4s, v0.4s, #31 -; NONEON-NOSVE-NEXT: shl v1.4s, v1.4s, #31 -; NONEON-NOSVE-NEXT: cmlt v0.4s, v0.4s, #0 -; NONEON-NOSVE-NEXT: cmlt v1.4s, v1.4s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #34] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #38] +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #72] +; NONEON-NOSVE-NEXT: sbfx w8, w14, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w15, #0, #1 +; NONEON-NOSVE-NEXT: stp w8, w12, [sp, #64] +; NONEON-NOSVE-NEXT: sbfx w12, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: stp w12, w10, [sp, #56] +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = sext <8 x i1> %a to <8 x i32> store <8 x i32> %b, ptr %out @@ -73,17 +105,21 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) { ; NONEON-NOSVE-LABEL: sext_v4i3_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #61 -; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #61 -; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #61 -; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #61 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #16] +; NONEON-NOSVE-NEXT: sbfx x8, x8, #0, #3 +; NONEON-NOSVE-NEXT: sbfx x9, x9, #0, #3 +; NONEON-NOSVE-NEXT: sbfx x10, x10, #0, #3 +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #48] +; NONEON-NOSVE-NEXT: sbfx x8, x11, #0, #3 +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = sext <4 x i3> %a to <4 x i64> store <4 x i64> %b, ptr %out @@ -106,13 +142,45 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i16> store <16 x i16>%b, ptr %out @@ -138,20 +206,206 @@ define void @sext_v32i8_v32i16(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #272 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 272 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #94] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #92] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #90] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #240] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #86] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #83] +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #82] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #81] +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #111] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #109] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #107] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #105] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #104] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #103] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #101] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #99] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #97] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #144] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #272 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -177,14 +431,42 @@ define void @sext_v8i8_v8i32(<8 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i8_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #44] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #36] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = sext <8 x i8> %a to <8 x i32> store <8 x i32>%b, ptr %out @@ -210,21 +492,75 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp, #-160]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #94] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i32> store <16 x i32> %b, ptr %out @@ -263,36 +599,280 @@ define void @sext_v32i8_v32i32(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: sshll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: sshll v3.8h, v3.8b, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] -; NONEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0 -; NONEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: sshll v0.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v2.4s, v6.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: sshll v1.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v7.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: sub sp, sp, #464 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #384] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #416] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #432] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #448] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 464 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #90] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #88] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #94] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #448] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #432] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #416] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #82] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #384] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #86] +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #85] +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #84] +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #107] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #105] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #176] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #182] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #111] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #109] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #99] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #97] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #103] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #101] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #198] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: str w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #196] +; NONEON-NOSVE-NEXT: str w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #194] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #192] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #206] +; NONEON-NOSVE-NEXT: str w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #204] +; NONEON-NOSVE-NEXT: str w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #202] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #200] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #180] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #272] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #178] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #176] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #190] +; NONEON-NOSVE-NEXT: str w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #188] +; NONEON-NOSVE-NEXT: str w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #186] +; NONEON-NOSVE-NEXT: str w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #184] +; NONEON-NOSVE-NEXT: str w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #230] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #240] +; NONEON-NOSVE-NEXT: str w8, [sp, #348] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #228] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #226] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #238] +; NONEON-NOSVE-NEXT: str w8, [sp, #364] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #236] +; NONEON-NOSVE-NEXT: str w8, [sp, #360] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #234] +; NONEON-NOSVE-NEXT: str w8, [sp, #356] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #232] +; NONEON-NOSVE-NEXT: str w8, [sp, #352] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #214] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #336] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #212] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #210] +; NONEON-NOSVE-NEXT: str w8, [sp, #308] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #222] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #220] +; NONEON-NOSVE-NEXT: str w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #218] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #216] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #304] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #464 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -325,17 +905,19 @@ define void @sext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; NONEON-NOSVE-LABEL: sext_v4i8_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #56 -; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #56 -; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #56 -; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #56 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb x8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb x9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb x10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb x11, [sp, #20] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #48] +; NONEON-NOSVE-NEXT: stp x10, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = sext <4 x i8> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -362,22 +944,57 @@ define void @sext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i8_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: sub sp, sp, #176 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 176 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: add x8, sp, #144 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w9, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #48] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #80] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #96] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #144] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #104] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #160] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #80] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #112] +; NONEON-NOSVE-NEXT: ldpsw x9, x10, [sp, #88] +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #112] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x8] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #176 ; NONEON-NOSVE-NEXT: ret %b = sext <8 x i8> %a to <8 x i64> store <8 x i64>%b, ptr %out @@ -419,37 +1036,109 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v16i8_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-112]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 112 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #48] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #80] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #72] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #104] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] -; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q5, [x0, #64] -; NONEON-NOSVE-NEXT: sshll v1.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q4, [x0] -; NONEON-NOSVE-NEXT: sshll v0.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #96] -; NONEON-NOSVE-NEXT: stp q0, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #112 +; NONEON-NOSVE-NEXT: sub sp, sp, #368 +; NONEON-NOSVE-NEXT: str x29, [sp, #352] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 368 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #352] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #102] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #100] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #92] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #80] +; NONEON-NOSVE-NEXT: str d0, [sp, #360] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #76] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #192] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #104] +; NONEON-NOSVE-NEXT: str d2, [sp, #168] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #176] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #216] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #320] +; NONEON-NOSVE-NEXT: ldrsw x9, [sp, #364] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #360] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #336] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #200] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #320] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #288] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #208] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #304] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #184] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #288] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #256] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #192] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #272] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #168] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #256] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #224] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #176] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #240] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #224] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q3, q4, [x0, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x0, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x0, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #368 ; NONEON-NOSVE-NEXT: ret %b = sext <16 x i8> %a to <16 x i64> store <16 x i64> %b, ptr %out @@ -522,69 +1211,367 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v32i8_v32i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #224 -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp] -; NONEON-NOSVE-NEXT: sshll v5.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: sshll v6.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v3.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v4.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: stp q3, q5, [sp, #32] -; NONEON-NOSVE-NEXT: sshll v5.4s, v5.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #56] -; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] -; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #64] -; NONEON-NOSVE-NEXT: sshll v6.4s, v6.4h, #0 -; NONEON-NOSVE-NEXT: sshll v4.4s, v4.4h, #0 -; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v7.4s, v7.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q5, [sp, #128] -; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ldr d19, [sp, #152] -; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #96] -; NONEON-NOSVE-NEXT: ldr d20, [sp, #136] -; NONEON-NOSVE-NEXT: stp q1, q4, [sp, #160] -; NONEON-NOSVE-NEXT: ldr d17, [sp, #104] -; NONEON-NOSVE-NEXT: ldr d21, [sp, #120] -; NONEON-NOSVE-NEXT: stp q7, q6, [sp, #192] -; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: sshll v19.2d, v19.2s, #0 -; NONEON-NOSVE-NEXT: ldr d16, [sp, #216] -; NONEON-NOSVE-NEXT: ldr d22, [sp, #200] -; NONEON-NOSVE-NEXT: ldr d23, [sp, #184] -; NONEON-NOSVE-NEXT: ldr d18, [sp, #168] -; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: sshll v16.2d, v16.2s, #0 -; NONEON-NOSVE-NEXT: stp q5, q19, [x1] -; NONEON-NOSVE-NEXT: sshll v5.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: sshll v7.2d, v22.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: stp q6, q16, [x1, #128] -; NONEON-NOSVE-NEXT: sshll v6.2d, v23.2s, #0 -; NONEON-NOSVE-NEXT: stp q5, q7, [x1, #160] -; NONEON-NOSVE-NEXT: sshll v5.2d, v20.2s, #0 -; NONEON-NOSVE-NEXT: stp q4, q6, [x1, #192] -; NONEON-NOSVE-NEXT: sshll v4.2d, v21.2s, #0 -; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #32] -; NONEON-NOSVE-NEXT: sshll v2.2d, v17.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #64] -; NONEON-NOSVE-NEXT: sshll v3.2d, v18.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #96] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #224] -; NONEON-NOSVE-NEXT: add sp, sp, #224 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #752 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 848 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #90] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #88] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #94] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #83] +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #82] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #81] +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #87] +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #86] +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #85] +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #84] +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #107] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #105] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #176] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #178] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #111] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #109] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #99] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #97] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #103] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #101] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #194] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: str w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #192] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #198] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #196] +; NONEON-NOSVE-NEXT: str w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #202] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #272] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #200] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #206] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #400] +; NONEON-NOSVE-NEXT: str w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #204] +; NONEON-NOSVE-NEXT: str w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #176] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #288] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #182] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #180] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #186] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #416] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #240] +; NONEON-NOSVE-NEXT: str w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #184] +; NONEON-NOSVE-NEXT: str w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #190] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #368] +; NONEON-NOSVE-NEXT: str w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #188] +; NONEON-NOSVE-NEXT: ldrsw x9, [sp, #372] +; NONEON-NOSVE-NEXT: str w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #226] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #256] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #230] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #384] +; NONEON-NOSVE-NEXT: str w8, [sp, #348] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #228] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #234] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #336] +; NONEON-NOSVE-NEXT: str w8, [sp, #356] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #232] +; NONEON-NOSVE-NEXT: str w8, [sp, #352] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #238] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #464] +; NONEON-NOSVE-NEXT: str w8, [sp, #364] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #236] +; NONEON-NOSVE-NEXT: str w8, [sp, #360] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #210] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #352] +; NONEON-NOSVE-NEXT: str w8, [sp, #308] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #214] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #480] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #212] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #218] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #304] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #216] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #222] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #432] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #220] +; NONEON-NOSVE-NEXT: str w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #404] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #320] +; NONEON-NOSVE-NEXT: str x8, [sp, #568] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #400] +; NONEON-NOSVE-NEXT: str x8, [sp, #560] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #412] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #448] +; NONEON-NOSVE-NEXT: str x8, [sp, #584] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #408] +; NONEON-NOSVE-NEXT: str x8, [sp, #576] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #420] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #560] +; NONEON-NOSVE-NEXT: str x8, [sp, #600] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #416] +; NONEON-NOSVE-NEXT: str x8, [sp, #592] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #428] +; NONEON-NOSVE-NEXT: str x8, [sp, #616] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #424] +; NONEON-NOSVE-NEXT: str x8, [sp, #608] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #368] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #592] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #496] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #380] +; NONEON-NOSVE-NEXT: str x8, [sp, #520] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #376] +; NONEON-NOSVE-NEXT: str x8, [sp, #512] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #388] +; NONEON-NOSVE-NEXT: ldp q4, q5, [sp, #496] +; NONEON-NOSVE-NEXT: str x8, [sp, #536] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #384] +; NONEON-NOSVE-NEXT: str x8, [sp, #528] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #396] +; NONEON-NOSVE-NEXT: str x8, [sp, #552] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #392] +; NONEON-NOSVE-NEXT: str x8, [sp, #544] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #468] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #528] +; NONEON-NOSVE-NEXT: str x8, [sp, #696] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #464] +; NONEON-NOSVE-NEXT: str x8, [sp, #688] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #476] +; NONEON-NOSVE-NEXT: str x8, [sp, #712] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #472] +; NONEON-NOSVE-NEXT: str x8, [sp, #704] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #484] +; NONEON-NOSVE-NEXT: ldp q16, q17, [sp, #688] +; NONEON-NOSVE-NEXT: str x8, [sp, #728] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #480] +; NONEON-NOSVE-NEXT: str x8, [sp, #720] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #492] +; NONEON-NOSVE-NEXT: str x8, [sp, #744] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #488] +; NONEON-NOSVE-NEXT: str x8, [sp, #736] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #436] +; NONEON-NOSVE-NEXT: ldp q19, q20, [sp, #720] +; NONEON-NOSVE-NEXT: str x8, [sp, #632] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #432] +; NONEON-NOSVE-NEXT: str x8, [sp, #624] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #444] +; NONEON-NOSVE-NEXT: str x8, [sp, #648] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #440] +; NONEON-NOSVE-NEXT: str x8, [sp, #640] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #452] +; NONEON-NOSVE-NEXT: ldp q22, q23, [sp, #624] +; NONEON-NOSVE-NEXT: str x8, [sp, #664] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #448] +; NONEON-NOSVE-NEXT: str x8, [sp, #656] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #460] +; NONEON-NOSVE-NEXT: str x8, [sp, #680] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #456] +; NONEON-NOSVE-NEXT: str x8, [sp, #672] +; NONEON-NOSVE-NEXT: ldp q21, q18, [sp, #656] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1, #32] +; NONEON-NOSVE-NEXT: stp q4, q5, [x1, #64] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #96] +; NONEON-NOSVE-NEXT: stp q16, q17, [x1, #128] +; NONEON-NOSVE-NEXT: stp q19, q20, [x1, #160] +; NONEON-NOSVE-NEXT: stp q22, q23, [x1, #192] +; NONEON-NOSVE-NEXT: stp q21, q18, [x1, #224] +; NONEON-NOSVE-NEXT: add sp, sp, #752 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -609,13 +1596,25 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i16_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = sext <8 x i16> %a to <8 x i32> store <8 x i32>%b, ptr %out @@ -640,20 +1639,91 @@ define void @sext_v16i16_v16i32(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v16i16_v16i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w5, [sp] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #14] +; NONEON-NOSVE-NEXT: add w13, w13, w13 +; NONEON-NOSVE-NEXT: add w14, w14, w14 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w14, [sp, #46] +; NONEON-NOSVE-NEXT: add w14, w3, w3 +; NONEON-NOSVE-NEXT: strh w13, [sp, #44] +; NONEON-NOSVE-NEXT: add w13, w5, w5 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w14, [sp, #42] +; NONEON-NOSVE-NEXT: add w14, w4, w4 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w13, [sp, #40] +; NONEON-NOSVE-NEXT: add w13, w2, w2 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #22] +; NONEON-NOSVE-NEXT: strh w14, [sp, #38] +; NONEON-NOSVE-NEXT: add w14, w0, w0 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w13, [sp, #36] +; NONEON-NOSVE-NEXT: add w13, w18, w18 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w14, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #30] +; NONEON-NOSVE-NEXT: strh w13, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #18] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #20] +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: add w14, w17, w17 +; NONEON-NOSVE-NEXT: add w12, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w13, w16, w16 +; NONEON-NOSVE-NEXT: add w11, w11, w11 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: add w10, w10, w10 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #76] +; NONEON-NOSVE-NEXT: strh w14, [sp, #62] +; NONEON-NOSVE-NEXT: add w14, w15, w15 +; NONEON-NOSVE-NEXT: strh w13, [sp, #60] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #72] +; NONEON-NOSVE-NEXT: strh w14, [sp, #58] +; NONEON-NOSVE-NEXT: strh w12, [sp, #56] +; NONEON-NOSVE-NEXT: strh w11, [sp, #54] +; NONEON-NOSVE-NEXT: strh w10, [sp, #52] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a @@ -679,14 +1749,24 @@ define void @sext_v4i16_v4i64(<4 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v4i16_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #40] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = sext <4 x i16> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -712,21 +1792,39 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i16_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp, #-160]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #88] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #144] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #128] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #72] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %b = sext <8 x i16> %a to <8 x i64> store <8 x i64>%b, ptr %out @@ -765,36 +1863,124 @@ define void @sext_v16i16_v16i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v16i16_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] -; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: sshll v0.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: sshll v1.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: sub sp, sp, #368 +; NONEON-NOSVE-NEXT: str x29, [sp, #352] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 368 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #352] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w5, [sp] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #14] +; NONEON-NOSVE-NEXT: add w13, w13, w13 +; NONEON-NOSVE-NEXT: add w14, w14, w14 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w14, [sp, #54] +; NONEON-NOSVE-NEXT: add w14, w3, w3 +; NONEON-NOSVE-NEXT: strh w13, [sp, #52] +; NONEON-NOSVE-NEXT: add w13, w5, w5 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w14, [sp, #50] +; NONEON-NOSVE-NEXT: add w14, w4, w4 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w13, [sp, #48] +; NONEON-NOSVE-NEXT: add w13, w2, w2 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #22] +; NONEON-NOSVE-NEXT: strh w14, [sp, #46] +; NONEON-NOSVE-NEXT: add w14, w0, w0 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w13, [sp, #44] +; NONEON-NOSVE-NEXT: add w13, w18, w18 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w14, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #30] +; NONEON-NOSVE-NEXT: strh w13, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #18] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #20] +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: add w14, w17, w17 +; NONEON-NOSVE-NEXT: add w12, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w13, w16, w16 +; NONEON-NOSVE-NEXT: add w11, w11, w11 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: add w10, w10, w10 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w14, [sp, #70] +; NONEON-NOSVE-NEXT: add w14, w15, w15 +; NONEON-NOSVE-NEXT: strh w13, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #84] +; NONEON-NOSVE-NEXT: strh w14, [sp, #66] +; NONEON-NOSVE-NEXT: strh w12, [sp, #64] +; NONEON-NOSVE-NEXT: strh w11, [sp, #62] +; NONEON-NOSVE-NEXT: strh w10, [sp, #60] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #76] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #98] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #102] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #100] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #184] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #92] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #168] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: str d0, [sp, #360] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #136] +; NONEON-NOSVE-NEXT: str d2, [sp, #200] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #184] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #256] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #192] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #272] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #168] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #256] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #224] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #176] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #240] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #216] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #224] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #320] +; NONEON-NOSVE-NEXT: ldrsw x9, [sp, #364] +; NONEON-NOSVE-NEXT: ldrsw x8, [sp, #360] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #336] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #200] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #320] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #288] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #208] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #304] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #288] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #368 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a @@ -819,13 +2005,17 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v4i32_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #24] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = sext <4 x i32> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -850,20 +2040,43 @@ define void @sext_v8i32_v8i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: sext_v8i32_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #16] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #72] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #112] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #64] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #88] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #96] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #144] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #80] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = add <8 x i32> %a, %a @@ -888,13 +2101,45 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i16> store <16 x i16>%b, ptr %out @@ -920,20 +2165,206 @@ define void @zext_v32i8_v32i16(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #272 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 272 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #94] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #92] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #90] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #240] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #86] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #83] +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #82] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #81] +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #111] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #109] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #107] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #105] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #104] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #103] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #101] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #99] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #97] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #144] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #272 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -959,14 +2390,42 @@ define void @zext_v8i8_v8i32(<8 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v8i8_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = zext <8 x i8> %a to <8 x i32> store <8 x i32>%b, ptr %out @@ -992,21 +2451,75 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp, #-160]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #94] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i32> store <16 x i32> %b, ptr %out @@ -1045,36 +2558,280 @@ define void @zext_v32i8_v32i32(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: ushll v3.8h, v3.8b, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] -; NONEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0 -; NONEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: ushll v0.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v2.4s, v6.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: ushll v1.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v7.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: sub sp, sp, #464 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #384] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #416] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #432] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #448] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 464 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #90] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #88] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #94] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #448] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #432] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #416] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #82] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #384] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #86] +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #85] +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #84] +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #107] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #105] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #176] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #182] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #111] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #109] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #99] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #97] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #103] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #101] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #198] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: str w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #196] +; NONEON-NOSVE-NEXT: str w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #194] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #192] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #206] +; NONEON-NOSVE-NEXT: str w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #204] +; NONEON-NOSVE-NEXT: str w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #202] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #200] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #180] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #272] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #178] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #176] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #190] +; NONEON-NOSVE-NEXT: str w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #188] +; NONEON-NOSVE-NEXT: str w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #186] +; NONEON-NOSVE-NEXT: str w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #184] +; NONEON-NOSVE-NEXT: str w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #230] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #240] +; NONEON-NOSVE-NEXT: str w8, [sp, #348] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #228] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #226] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #238] +; NONEON-NOSVE-NEXT: str w8, [sp, #364] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #236] +; NONEON-NOSVE-NEXT: str w8, [sp, #360] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #234] +; NONEON-NOSVE-NEXT: str w8, [sp, #356] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #232] +; NONEON-NOSVE-NEXT: str w8, [sp, #352] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #214] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #336] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #212] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #210] +; NONEON-NOSVE-NEXT: str w8, [sp, #308] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #222] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #220] +; NONEON-NOSVE-NEXT: str w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #218] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #216] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #304] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #464 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -1104,16 +2861,26 @@ define void @zext_v4i8_v4i64(<4 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v4i8_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = zext <4 x i8> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -1140,22 +2907,61 @@ define void @zext_v8i8_v8i64(<8 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v8i8_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: sub sp, sp, #176 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 176 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: add x8, sp, #144 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w9, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #96] +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #48] +; NONEON-NOSVE-NEXT: stp w10, wzr, [sp, #152] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #144] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #104] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #80] +; NONEON-NOSVE-NEXT: stp w10, wzr, [sp, #168] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #160] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #80] +; NONEON-NOSVE-NEXT: stp w10, wzr, [sp, #120] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #88] +; NONEON-NOSVE-NEXT: stp w10, wzr, [sp, #136] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #112] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x8] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #176 ; NONEON-NOSVE-NEXT: ret %b = zext <8 x i8> %a to <8 x i64> store <8 x i64>%b, ptr %out @@ -1197,37 +3003,129 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v16i8_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-112]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 112 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v1.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #40] -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #48] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #80] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #72] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #104] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #56] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] -; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q5, [x0, #64] -; NONEON-NOSVE-NEXT: ushll v1.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q4, [x0] -; NONEON-NOSVE-NEXT: ushll v0.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #96] -; NONEON-NOSVE-NEXT: stp q0, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #112 +; NONEON-NOSVE-NEXT: sub sp, sp, #368 +; NONEON-NOSVE-NEXT: str x29, [sp, #352] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 368 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #352] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: str wzr, [sp, #332] +; NONEON-NOSVE-NEXT: str wzr, [sp, #324] +; NONEON-NOSVE-NEXT: str wzr, [sp, #348] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: str wzr, [sp, #340] +; NONEON-NOSVE-NEXT: str wzr, [sp, #300] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: str wzr, [sp, #292] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: str wzr, [sp, #316] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: str wzr, [sp, #308] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: str wzr, [sp, #268] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: str wzr, [sp, #260] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: str wzr, [sp, #284] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: str wzr, [sp, #276] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #102] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #100] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #92] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #80] +; NONEON-NOSVE-NEXT: str d0, [sp, #360] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #72] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #76] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #216] +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #192] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #104] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #364] +; NONEON-NOSVE-NEXT: str w9, [sp, #328] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #360] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #176] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #200] +; NONEON-NOSVE-NEXT: str d2, [sp, #168] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #320] +; NONEON-NOSVE-NEXT: str w9, [sp, #296] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #208] +; NONEON-NOSVE-NEXT: str w9, [sp, #312] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #288] +; NONEON-NOSVE-NEXT: str w9, [sp, #264] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #252] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #192] +; NONEON-NOSVE-NEXT: str w9, [sp, #280] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #256] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #232] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #224] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: stp wzr, w9, [sp, #244] +; NONEON-NOSVE-NEXT: str w8, [sp, #240] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #224] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: stp q3, q4, [x0, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x0, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x0, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #368 ; NONEON-NOSVE-NEXT: ret %b = zext <16 x i8> %a to <16 x i64> store <16 x i64> %b, ptr %out @@ -1300,69 +3198,400 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v32i8_v32i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #224 -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [sp] -; NONEON-NOSVE-NEXT: ushll v5.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: ushll v6.8h, v1.8b, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v3.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v4.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: stp q3, q5, [sp, #32] -; NONEON-NOSVE-NEXT: ushll v5.4s, v5.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #56] -; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] -; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #64] -; NONEON-NOSVE-NEXT: ushll v6.4s, v6.4h, #0 -; NONEON-NOSVE-NEXT: ushll v4.4s, v4.4h, #0 -; NONEON-NOSVE-NEXT: ldr d7, [sp, #88] -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #72] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v7.4s, v7.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q5, [sp, #128] -; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ldr d19, [sp, #152] -; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #96] -; NONEON-NOSVE-NEXT: ldr d20, [sp, #136] -; NONEON-NOSVE-NEXT: stp q1, q4, [sp, #160] -; NONEON-NOSVE-NEXT: ldr d17, [sp, #104] -; NONEON-NOSVE-NEXT: ldr d21, [sp, #120] -; NONEON-NOSVE-NEXT: stp q7, q6, [sp, #192] -; NONEON-NOSVE-NEXT: ushll v6.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: ushll v19.2d, v19.2s, #0 -; NONEON-NOSVE-NEXT: ldr d16, [sp, #216] -; NONEON-NOSVE-NEXT: ldr d22, [sp, #200] -; NONEON-NOSVE-NEXT: ldr d23, [sp, #184] -; NONEON-NOSVE-NEXT: ldr d18, [sp, #168] -; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ushll v16.2d, v16.2s, #0 -; NONEON-NOSVE-NEXT: stp q5, q19, [x1] -; NONEON-NOSVE-NEXT: ushll v5.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: ushll v7.2d, v22.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: stp q6, q16, [x1, #128] -; NONEON-NOSVE-NEXT: ushll v6.2d, v23.2s, #0 -; NONEON-NOSVE-NEXT: stp q5, q7, [x1, #160] -; NONEON-NOSVE-NEXT: ushll v5.2d, v20.2s, #0 -; NONEON-NOSVE-NEXT: stp q4, q6, [x1, #192] -; NONEON-NOSVE-NEXT: ushll v4.2d, v21.2s, #0 -; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #32] -; NONEON-NOSVE-NEXT: ushll v2.2d, v17.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #64] -; NONEON-NOSVE-NEXT: ushll v3.2d, v18.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #96] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #224] -; NONEON-NOSVE-NEXT: add sp, sp, #224 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #752 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 848 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str wzr, [sp, #572] +; NONEON-NOSVE-NEXT: str wzr, [sp, #564] +; NONEON-NOSVE-NEXT: str wzr, [sp, #588] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #22] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: add w18, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #31] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #29] +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #23] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #27] +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #21] +; NONEON-NOSVE-NEXT: add w17, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #37] +; NONEON-NOSVE-NEXT: strb w17, [sp, #63] +; NONEON-NOSVE-NEXT: add w17, w30, w30 +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #35] +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #47] +; NONEON-NOSVE-NEXT: strb w18, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: strb w17, [sp, #61] +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #91] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #90] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #89] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #88] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #95] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #94] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str wzr, [sp, #580] +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #93] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: str wzr, [sp, #604] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #83] +; NONEON-NOSVE-NEXT: str wzr, [sp, #596] +; NONEON-NOSVE-NEXT: strh w8, [sp, #118] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #82] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: strh w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #81] +; NONEON-NOSVE-NEXT: str wzr, [sp, #620] +; NONEON-NOSVE-NEXT: strh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: strh w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #87] +; NONEON-NOSVE-NEXT: str wzr, [sp, #612] +; NONEON-NOSVE-NEXT: strh w8, [sp, #126] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #86] +; NONEON-NOSVE-NEXT: str wzr, [sp, #508] +; NONEON-NOSVE-NEXT: strh w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #85] +; NONEON-NOSVE-NEXT: str wzr, [sp, #500] +; NONEON-NOSVE-NEXT: strh w8, [sp, #122] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #84] +; NONEON-NOSVE-NEXT: str wzr, [sp, #524] +; NONEON-NOSVE-NEXT: strh w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #107] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #112] +; NONEON-NOSVE-NEXT: str wzr, [sp, #516] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #106] +; NONEON-NOSVE-NEXT: str wzr, [sp, #540] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #105] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #176] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #178] +; NONEON-NOSVE-NEXT: str wzr, [sp, #532] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #111] +; NONEON-NOSVE-NEXT: str wzr, [sp, #556] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #110] +; NONEON-NOSVE-NEXT: str wzr, [sp, #548] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #109] +; NONEON-NOSVE-NEXT: str wzr, [sp, #700] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #108] +; NONEON-NOSVE-NEXT: str wzr, [sp, #692] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #99] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: str wzr, [sp, #716] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #98] +; NONEON-NOSVE-NEXT: str wzr, [sp, #708] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #97] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #224] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #96] +; NONEON-NOSVE-NEXT: str wzr, [sp, #732] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #103] +; NONEON-NOSVE-NEXT: str wzr, [sp, #724] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #102] +; NONEON-NOSVE-NEXT: str wzr, [sp, #748] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #101] +; NONEON-NOSVE-NEXT: str wzr, [sp, #740] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #100] +; NONEON-NOSVE-NEXT: str wzr, [sp, #636] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #194] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: str wzr, [sp, #628] +; NONEON-NOSVE-NEXT: str w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #192] +; NONEON-NOSVE-NEXT: str wzr, [sp, #652] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #198] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #196] +; NONEON-NOSVE-NEXT: str wzr, [sp, #644] +; NONEON-NOSVE-NEXT: str w8, [sp, #280] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #202] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #272] +; NONEON-NOSVE-NEXT: str wzr, [sp, #668] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #200] +; NONEON-NOSVE-NEXT: str wzr, [sp, #660] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #206] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #400] +; NONEON-NOSVE-NEXT: str w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #204] +; NONEON-NOSVE-NEXT: str wzr, [sp, #684] +; NONEON-NOSVE-NEXT: str w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #176] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #288] +; NONEON-NOSVE-NEXT: str wzr, [sp, #676] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #240] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #182] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #180] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #248] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #186] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #416] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #240] +; NONEON-NOSVE-NEXT: str w8, [sp, #260] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #184] +; NONEON-NOSVE-NEXT: str w8, [sp, #256] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #190] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #368] +; NONEON-NOSVE-NEXT: str w8, [sp, #268] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #188] +; NONEON-NOSVE-NEXT: str w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #226] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #256] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #230] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #384] +; NONEON-NOSVE-NEXT: str w8, [sp, #348] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #228] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #234] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #336] +; NONEON-NOSVE-NEXT: str w8, [sp, #356] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #232] +; NONEON-NOSVE-NEXT: str w8, [sp, #352] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #238] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #464] +; NONEON-NOSVE-NEXT: str w8, [sp, #364] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #236] +; NONEON-NOSVE-NEXT: str w8, [sp, #360] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #210] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #352] +; NONEON-NOSVE-NEXT: str w8, [sp, #308] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #208] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #214] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #480] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #212] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #218] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #304] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #216] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #222] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #432] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #220] +; NONEON-NOSVE-NEXT: str w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #404] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #320] +; NONEON-NOSVE-NEXT: str w8, [sp, #568] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #400] +; NONEON-NOSVE-NEXT: str w8, [sp, #560] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #412] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #448] +; NONEON-NOSVE-NEXT: str w8, [sp, #584] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #408] +; NONEON-NOSVE-NEXT: str w8, [sp, #576] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #420] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #560] +; NONEON-NOSVE-NEXT: str w8, [sp, #600] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #416] +; NONEON-NOSVE-NEXT: str w8, [sp, #592] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #428] +; NONEON-NOSVE-NEXT: str w8, [sp, #616] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #424] +; NONEON-NOSVE-NEXT: str w8, [sp, #608] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #372] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #592] +; NONEON-NOSVE-NEXT: str w8, [sp, #504] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #368] +; NONEON-NOSVE-NEXT: str w8, [sp, #496] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #380] +; NONEON-NOSVE-NEXT: str w8, [sp, #520] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #376] +; NONEON-NOSVE-NEXT: str w8, [sp, #512] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #388] +; NONEON-NOSVE-NEXT: ldp q4, q5, [sp, #496] +; NONEON-NOSVE-NEXT: str w8, [sp, #536] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #384] +; NONEON-NOSVE-NEXT: str w8, [sp, #528] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #396] +; NONEON-NOSVE-NEXT: str w8, [sp, #552] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #392] +; NONEON-NOSVE-NEXT: str w8, [sp, #544] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #468] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #528] +; NONEON-NOSVE-NEXT: str w8, [sp, #696] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #464] +; NONEON-NOSVE-NEXT: str w8, [sp, #688] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #476] +; NONEON-NOSVE-NEXT: str w8, [sp, #712] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #472] +; NONEON-NOSVE-NEXT: str w8, [sp, #704] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #484] +; NONEON-NOSVE-NEXT: ldp q16, q17, [sp, #688] +; NONEON-NOSVE-NEXT: str w8, [sp, #728] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #480] +; NONEON-NOSVE-NEXT: str w8, [sp, #720] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #492] +; NONEON-NOSVE-NEXT: str w8, [sp, #744] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #488] +; NONEON-NOSVE-NEXT: str w8, [sp, #736] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #436] +; NONEON-NOSVE-NEXT: ldp q19, q20, [sp, #720] +; NONEON-NOSVE-NEXT: str w8, [sp, #632] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #432] +; NONEON-NOSVE-NEXT: str w8, [sp, #624] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #444] +; NONEON-NOSVE-NEXT: str w8, [sp, #648] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #440] +; NONEON-NOSVE-NEXT: str w8, [sp, #640] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #452] +; NONEON-NOSVE-NEXT: ldp q22, q23, [sp, #624] +; NONEON-NOSVE-NEXT: str w8, [sp, #664] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #448] +; NONEON-NOSVE-NEXT: str w8, [sp, #656] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #460] +; NONEON-NOSVE-NEXT: str w8, [sp, #680] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #456] +; NONEON-NOSVE-NEXT: str w8, [sp, #672] +; NONEON-NOSVE-NEXT: ldp q21, q18, [sp, #656] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1, #32] +; NONEON-NOSVE-NEXT: stp q4, q5, [x1, #64] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #96] +; NONEON-NOSVE-NEXT: stp q16, q17, [x1, #128] +; NONEON-NOSVE-NEXT: stp q19, q20, [x1, #160] +; NONEON-NOSVE-NEXT: stp q22, q23, [x1, #192] +; NONEON-NOSVE-NEXT: stp q21, q18, [x1, #224] +; NONEON-NOSVE-NEXT: add sp, sp, #752 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %a = load <32 x i8>, ptr %in %b = add <32 x i8> %a, %a @@ -1387,13 +3616,25 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v8i16_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = zext <8 x i16> %a to <8 x i32> store <8 x i32>%b, ptr %out @@ -1418,20 +3659,91 @@ define void @zext_v16i16_v16i32(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v16i16_v16i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w5, [sp] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #14] +; NONEON-NOSVE-NEXT: add w13, w13, w13 +; NONEON-NOSVE-NEXT: add w14, w14, w14 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w14, [sp, #46] +; NONEON-NOSVE-NEXT: add w14, w3, w3 +; NONEON-NOSVE-NEXT: strh w13, [sp, #44] +; NONEON-NOSVE-NEXT: add w13, w5, w5 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w14, [sp, #42] +; NONEON-NOSVE-NEXT: add w14, w4, w4 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w13, [sp, #40] +; NONEON-NOSVE-NEXT: add w13, w2, w2 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #22] +; NONEON-NOSVE-NEXT: strh w14, [sp, #38] +; NONEON-NOSVE-NEXT: add w14, w0, w0 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w13, [sp, #36] +; NONEON-NOSVE-NEXT: add w13, w18, w18 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w14, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #30] +; NONEON-NOSVE-NEXT: strh w13, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #18] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #20] +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: add w14, w17, w17 +; NONEON-NOSVE-NEXT: add w12, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w13, w16, w16 +; NONEON-NOSVE-NEXT: add w11, w11, w11 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: add w10, w10, w10 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #76] +; NONEON-NOSVE-NEXT: strh w14, [sp, #62] +; NONEON-NOSVE-NEXT: add w14, w15, w15 +; NONEON-NOSVE-NEXT: strh w13, [sp, #60] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #72] +; NONEON-NOSVE-NEXT: strh w14, [sp, #58] +; NONEON-NOSVE-NEXT: strh w12, [sp, #56] +; NONEON-NOSVE-NEXT: strh w11, [sp, #54] +; NONEON-NOSVE-NEXT: strh w10, [sp, #52] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #96] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a @@ -1457,14 +3769,26 @@ define void @zext_v4i16_v4i64(<4 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v4i16_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %b = zext <4 x i16> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -1490,21 +3814,43 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v8i16_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp, #-160]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #144] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #128] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %b = zext <8 x i16> %a to <8 x i64> store <8 x i64>%b, ptr %out @@ -1543,36 +3889,144 @@ define void @zext_v16i16_v16i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v16i16_v16i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #72] -; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: ushll v0.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: ushll v1.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: sub sp, sp, #368 +; NONEON-NOSVE-NEXT: str x29, [sp, #352] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 368 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str wzr, [sp, #268] +; NONEON-NOSVE-NEXT: str wzr, [sp, #260] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #352] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: str wzr, [sp, #284] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w5, [sp] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #14] +; NONEON-NOSVE-NEXT: add w13, w13, w13 +; NONEON-NOSVE-NEXT: add w14, w14, w14 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w14, [sp, #54] +; NONEON-NOSVE-NEXT: add w14, w3, w3 +; NONEON-NOSVE-NEXT: strh w13, [sp, #52] +; NONEON-NOSVE-NEXT: add w13, w5, w5 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w14, [sp, #50] +; NONEON-NOSVE-NEXT: add w14, w4, w4 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w13, [sp, #48] +; NONEON-NOSVE-NEXT: add w13, w2, w2 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #22] +; NONEON-NOSVE-NEXT: strh w14, [sp, #46] +; NONEON-NOSVE-NEXT: add w14, w0, w0 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w13, [sp, #44] +; NONEON-NOSVE-NEXT: add w13, w18, w18 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w14, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #30] +; NONEON-NOSVE-NEXT: strh w13, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #18] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: add w14, w17, w17 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #20] +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: add w12, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w11, w11, w11 +; NONEON-NOSVE-NEXT: add w10, w10, w10 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: add w13, w16, w16 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #80] +; NONEON-NOSVE-NEXT: strh w14, [sp, #70] +; NONEON-NOSVE-NEXT: add w14, w15, w15 +; NONEON-NOSVE-NEXT: strh w13, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #84] +; NONEON-NOSVE-NEXT: strh w14, [sp, #66] +; NONEON-NOSVE-NEXT: strh w12, [sp, #64] +; NONEON-NOSVE-NEXT: strh w11, [sp, #62] +; NONEON-NOSVE-NEXT: strh w10, [sp, #60] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #72] +; NONEON-NOSVE-NEXT: str wzr, [sp, #276] +; NONEON-NOSVE-NEXT: str wzr, [sp, #332] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #76] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #98] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #96] +; NONEON-NOSVE-NEXT: str wzr, [sp, #324] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #102] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #100] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #184] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #88] +; NONEON-NOSVE-NEXT: str wzr, [sp, #348] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #92] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #168] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: str d0, [sp, #360] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp, #136] +; NONEON-NOSVE-NEXT: str wzr, [sp, #340] +; NONEON-NOSVE-NEXT: str w9, [sp, #264] +; NONEON-NOSVE-NEXT: stp wzr, w8, [sp, #252] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #192] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: str d2, [sp, #200] +; NONEON-NOSVE-NEXT: str w9, [sp, #280] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: str wzr, [sp, #300] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #256] +; NONEON-NOSVE-NEXT: str wzr, [sp, #292] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #232] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #224] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: str wzr, [sp, #316] +; NONEON-NOSVE-NEXT: str wzr, [sp, #308] +; NONEON-NOSVE-NEXT: stp wzr, w9, [sp, #244] +; NONEON-NOSVE-NEXT: str w8, [sp, #240] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #216] +; NONEON-NOSVE-NEXT: ldp q3, q4, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #364] +; NONEON-NOSVE-NEXT: str w9, [sp, #328] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #360] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #200] +; NONEON-NOSVE-NEXT: ldp q6, q7, [sp, #320] +; NONEON-NOSVE-NEXT: str w9, [sp, #296] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #208] +; NONEON-NOSVE-NEXT: str w9, [sp, #312] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #288] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #368 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = add <16 x i16> %a, %a @@ -1597,13 +4051,19 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v4i32_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %b = zext <4 x i32> %a to <4 x i64> store <4 x i64>%b, ptr %out @@ -1628,20 +4088,47 @@ define void @zext_v8i32_v8i64(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: zext_v8i32_v8i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #16] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #104] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #96] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #96] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #152] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #144] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: stp w9, wzr, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, wzr, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = add <8 x i32> %a, %a @@ -1672,17 +4159,17 @@ define void @extend_and_mul(i32 %0, <2 x i64> %1, ptr %2) { ; ; NONEON-NOSVE-LABEL: extend_and_mul: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v1.2s, w0 -; NONEON-NOSVE-NEXT: fmov x10, d0 -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: fmov x11, d1 -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: mul x10, x11, x10 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: mov w9, w0 +; NONEON-NOSVE-NEXT: mul x10, x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] ; NONEON-NOSVE-NEXT: mul x8, x9, x8 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: mov v0.d[1], x8 +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0 %broadcast.splat3 = shufflevector <2 x i32> %broadcast.splatinsert2, <2 x i32> poison, <2 x i32> zeroinitializer @@ -1702,9 +4189,12 @@ define void @extend_no_mul(i32 %0, <2 x i64> %1, ptr %2) { ; ; NONEON-NOSVE-LABEL: extend_no_mul: ; NONEON-NOSVE: // %bb.0: // %entry -; NONEON-NOSVE-NEXT: dup v0.2s, w0 -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 +; NONEON-NOSVE-NEXT: mov w8, w0 +; NONEON-NOSVE-NEXT: stp x8, x8, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret entry: %broadcast.splatinsert2 = insertelement <2 x i32> poison, i32 %0, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll index ade60b07150ce2..1f5bb5f5486af3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll @@ -26,11 +26,108 @@ define void @add_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: add_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: add v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 @@ -51,12 +148,60 @@ define void @add_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: add_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: add v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -77,12 +222,32 @@ define void @add_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: add_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: add w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: add w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -103,12 +268,22 @@ define void @add_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: add_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: add v1.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: add v0.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: add x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: add x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: add x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: add x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -133,11 +308,108 @@ define void @and_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: and_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: and w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 @@ -158,12 +430,60 @@ define void @and_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: and_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: and w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -184,12 +504,32 @@ define void @and_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: and_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: and w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: and w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -210,12 +550,22 @@ define void @and_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: and_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: and x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: and x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: and x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: and x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -240,10 +590,108 @@ define void @ashr_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: ashr_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: lsr w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i32 0 @@ -264,10 +712,60 @@ define void @ashr_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ashr_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v0.8h, v0.8h, #0 -; NONEON-NOSVE-NEXT: cmlt v1.8h, v1.8h, #0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: lsr w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -288,10 +786,32 @@ define void @ashr_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: ashr_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v0.4s, v0.4s, #0 -; NONEON-NOSVE-NEXT: cmlt v1.4s, v1.4s, #0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: asr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: asr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -312,10 +832,22 @@ define void @ashr_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: ashr_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v0.2d, v0.2d, #0 -; NONEON-NOSVE-NEXT: cmlt v1.2d, v1.2d, #0 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: asr x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: asr x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -343,11 +875,140 @@ define void @icmp_eq_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: icmp_eq_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmeq v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: cmeq v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #7 +; NONEON-NOSVE-NEXT: csetm w8, eq +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -372,12 +1033,76 @@ define void @icmp_sge_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: icmp_sge_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: cmge v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: cmge v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #14 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -402,12 +1127,40 @@ define void @icmp_sgt_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: icmp_sgt_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #-8 // =0xfffffff8 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: cmgt v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: cmgt v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: csetm w9, gt +; NONEON-NOSVE-NEXT: cmn w8, #8 +; NONEON-NOSVE-NEXT: csetm w8, gt +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 -8, i64 0 @@ -432,12 +1185,26 @@ define void @icmp_ult_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: icmp_ult_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: cmhi v1.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmhi v0.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csetm x9, lo +; NONEON-NOSVE-NEXT: cmp x8, #63 +; NONEON-NOSVE-NEXT: csetm x8, lo +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: csetm x9, lo +; NONEON-NOSVE-NEXT: cmp x8, #63 +; NONEON-NOSVE-NEXT: csetm x8, lo +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -463,10 +1230,108 @@ define void @lshr_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: lshr_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushr v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: ushr v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #7, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -487,10 +1352,60 @@ define void @lshr_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: lshr_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushr v0.8h, v0.8h, #15 -; NONEON-NOSVE-NEXT: ushr v1.8h, v1.8h, #15 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w8, w8, #15, #1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -511,10 +1426,32 @@ define void @lshr_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: lshr_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushr v0.4s, v0.4s, #31 -; NONEON-NOSVE-NEXT: ushr v1.4s, v1.4s, #31 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsr w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: lsr w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -535,10 +1472,22 @@ define void @lshr_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: lshr_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ushr v0.2d, v0.2d, #63 -; NONEON-NOSVE-NEXT: ushr v1.2d, v1.2d, #63 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: lsr x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: lsr x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -563,11 +1512,140 @@ define void @mul_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: mul_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: mul v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: mul v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: lsl w9, w8, #3 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -588,12 +1666,76 @@ define void @mul_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: mul_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: mul v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: mul v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: lsl w9, w8, #4 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -614,12 +1756,44 @@ define void @mul_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: mul_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: mul v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: mul v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: lsl w9, w8, #5 +; NONEON-NOSVE-NEXT: sub w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -640,24 +1814,28 @@ define void @mul_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: mul_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: fmov x10, d0 -; NONEON-NOSVE-NEXT: fmov x11, d1 -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: lsl x12, x10, #6 -; NONEON-NOSVE-NEXT: lsl x13, x11, #6 -; NONEON-NOSVE-NEXT: lsl x14, x8, #6 -; NONEON-NOSVE-NEXT: sub x10, x12, x10 -; NONEON-NOSVE-NEXT: sub x11, x13, x11 -; NONEON-NOSVE-NEXT: lsl x12, x9, #6 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: fmov d1, x11 -; NONEON-NOSVE-NEXT: sub x8, x14, x8 -; NONEON-NOSVE-NEXT: sub x9, x12, x9 -; NONEON-NOSVE-NEXT: mov v0.d[1], x8 -; NONEON-NOSVE-NEXT: mov v1.d[1], x9 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl x9, x8, #6 +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #56] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl x9, x8, #6 +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl x9, x8, #6 +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: lsl x9, x8, #6 +; NONEON-NOSVE-NEXT: sub x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -682,11 +1860,108 @@ define void @or_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: or_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -707,12 +1982,60 @@ define void @or_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: or_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -733,12 +2056,32 @@ define void @or_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: or_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: orr w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: orr w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: orr w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -759,12 +2102,22 @@ define void @or_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: or_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: orr v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: orr x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: orr x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: orr x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: orr x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -789,10 +2142,108 @@ define void @shl_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: shl_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: lsl w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -813,10 +2264,60 @@ define void @shl_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: shl_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: shl v0.8h, v0.8h, #15 -; NONEON-NOSVE-NEXT: shl v1.8h, v1.8h, #15 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: lsl w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -837,10 +2338,32 @@ define void @shl_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: shl_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: shl v0.4s, v0.4s, #31 -; NONEON-NOSVE-NEXT: shl v1.4s, v1.4s, #31 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: lsl w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: lsl w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -861,10 +2384,22 @@ define void @shl_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: shl_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: shl v0.2d, v0.2d, #63 -; NONEON-NOSVE-NEXT: shl v1.2d, v1.2d, #63 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: lsl x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: lsl x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -889,11 +2424,141 @@ define void @smax_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: smax_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smax v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smax v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: mov w8, #7 // =0x7 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w9, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -914,12 +2579,77 @@ define void @smax_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: smax_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: smax v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smax v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -940,12 +2670,41 @@ define void @smax_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: smax_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: smax v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smax v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, gt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, gt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, gt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, gt +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -966,14 +2725,27 @@ define void @smax_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: smax_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: cmgt v3.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: cmgt v4.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b -; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, gt +; NONEON-NOSVE-NEXT: ldr x9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x9, x9, x8, gt +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, gt +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, gt +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -998,11 +2770,141 @@ define void @smin_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: smin_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smin v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smin v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: mov w8, #7 // =0x7 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w9, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -1023,12 +2925,77 @@ define void @smin_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: smin_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: smin v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smin v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -1049,12 +3016,41 @@ define void @smin_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: smin_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: smin v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smin v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lt +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lt +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -1075,14 +3071,27 @@ define void @smin_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: smin_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: cmgt v3.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmgt v4.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b -; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, lt +; NONEON-NOSVE-NEXT: ldr x9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x9, x9, x8, lt +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, lt +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lt +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -1107,11 +3116,108 @@ define void @sub_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: sub_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: sub v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: sub v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: sub w8, w8, #7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -1132,12 +3238,60 @@ define void @sub_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: sub_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: sub v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: sub v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: sub w8, w8, #15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -1158,12 +3312,32 @@ define void @sub_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: sub_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: sub v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: sub v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sub w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: sub w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sub w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sub w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: sub w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: sub w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: sub w9, w8, #31 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: sub w8, w8, #31 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -1184,12 +3358,22 @@ define void @sub_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: sub_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: sub v1.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: sub v0.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sub x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: sub x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: sub x9, x8, #63 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: sub x8, x8, #63 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -1214,11 +3398,141 @@ define void @umax_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: umax_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umax v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umax v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: mov w8, #7 // =0x7 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #31] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #30] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #29] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #28] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #27] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #26] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #25] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #24] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #23] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #22] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #21] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #18] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #17] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w9, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: tst w9, #0xf8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -1239,12 +3553,77 @@ define void @umax_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: umax_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: umax v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umax v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: tst w9, #0xfff0 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -1265,12 +3644,41 @@ define void @umax_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: umax_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: umax v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umax v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, hi +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, hi +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, hi +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, hi +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -1291,14 +3699,27 @@ define void @umax_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: umax_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: cmhi v3.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: cmhi v4.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b -; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, hi +; NONEON-NOSVE-NEXT: ldr x9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x9, x9, x8, hi +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, hi +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, hi +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -1323,11 +3744,141 @@ define void @umin_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: umin_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umin v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umin v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: mov w8, #7 // =0x7 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w9, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #7 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -1348,12 +3899,77 @@ define void @umin_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: umin_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: umin v1.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umin v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w9, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #15 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -1374,12 +3990,41 @@ define void @umin_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: umin_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: umin v1.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umin v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lo +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lo +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lo +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w9, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w10, w9, w8, lo +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #31 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -1400,14 +4045,27 @@ define void @umin_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: umin_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: cmhi v3.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmhi v4.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: bif v1.16b, v0.16b, v3.16b -; NONEON-NOSVE-NEXT: bit v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, lo +; NONEON-NOSVE-NEXT: ldr x9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x9, x9, x8, lo +; NONEON-NOSVE-NEXT: stp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x10, x9, x8, lo +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: cmp x9, #63 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lo +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 @@ -1432,11 +4090,108 @@ define void @xor_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: xor_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #7 -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %ins = insertelement <32 x i8> undef, i8 7, i64 0 @@ -1457,12 +4212,60 @@ define void @xor_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: xor_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #15 // =0xf -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: eor w8, w8, #0xf +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %ins = insertelement <16 x i16> undef, i16 15, i64 0 @@ -1483,12 +4286,32 @@ define void @xor_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: xor_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: eor w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: eor w9, w8, #0x1f +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: eor w8, w8, #0x1f +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %ins = insertelement <8 x i32> undef, i32 31, i64 0 @@ -1509,12 +4332,22 @@ define void @xor_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: xor_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #63 // =0x3f -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: eor v1.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: eor x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: eor x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: eor x9, x8, #0x3f +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: eor x8, x8, #0x3f +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %ins = insertelement <4 x i64> undef, i64 63, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll index 4fc7ec3a8439df..3137a7bc7ad270 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll @@ -20,7 +20,43 @@ define <8 x i8> @and_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: and_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = and <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -37,7 +73,74 @@ define <16 x i8> @and_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: and_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = and <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -55,11 +158,143 @@ define void @and_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: and_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -79,7 +314,27 @@ define <4 x i16> @and_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: and_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = and <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -96,7 +351,42 @@ define <8 x i16> @and_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: and_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = and <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -114,11 +404,79 @@ define void @and_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: and_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -138,7 +496,18 @@ define <2 x i32> @and_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: and_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = and <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -155,7 +524,24 @@ define <4 x i32> @and_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: and_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = and <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -173,11 +559,43 @@ define void @and_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: and_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -197,7 +615,14 @@ define <1 x i64> @and_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: and_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: and x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = and <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -214,7 +639,17 @@ define <2 x i64> @and_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: and_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: and x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: and x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = and <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -232,11 +667,29 @@ define void @and_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: and_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: and v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: and x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: and x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: and x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: and x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -260,7 +713,43 @@ define <8 x i8> @or_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: or_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = or <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -277,7 +766,74 @@ define <16 x i8> @or_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: or_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = or <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -295,11 +851,143 @@ define void @or_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: or_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -319,7 +1007,27 @@ define <4 x i16> @or_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: or_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = or <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -336,7 +1044,42 @@ define <8 x i16> @or_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: or_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = or <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -354,11 +1097,79 @@ define void @or_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: or_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -378,7 +1189,18 @@ define <2 x i32> @or_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: or_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = or <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -395,7 +1217,24 @@ define <4 x i32> @or_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: or_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = or <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -413,11 +1252,43 @@ define void @or_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: or_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -437,7 +1308,14 @@ define <1 x i64> @or_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: or_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: orr x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = or <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -454,7 +1332,17 @@ define <2 x i64> @or_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: or_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: orr v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: orr x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: orr x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = or <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -472,11 +1360,29 @@ define void @or_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: or_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: orr v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: orr x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: orr x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: orr x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: orr x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -500,7 +1406,43 @@ define <8 x i8> @xor_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = xor <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -517,7 +1459,74 @@ define <16 x i8> @xor_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = xor <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -535,11 +1544,143 @@ define void @xor_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: xor_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -559,7 +1700,27 @@ define <4 x i16> @xor_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = xor <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -576,7 +1737,42 @@ define <8 x i16> @xor_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = xor <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -594,11 +1790,79 @@ define void @xor_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: xor_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -618,7 +1882,18 @@ define <2 x i32> @xor_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = xor <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -635,7 +1910,24 @@ define <4 x i32> @xor_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = xor <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -653,11 +1945,43 @@ define void @xor_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: xor_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -677,7 +2001,14 @@ define <1 x i64> @xor_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: eor x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = xor <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -694,7 +2025,17 @@ define <2 x i64> @xor_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: xor_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: eor v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: eor x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: eor x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = xor <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -712,11 +2053,29 @@ define void @xor_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: xor_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: eor v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: eor x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: eor x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: eor x8, x10, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: eor x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll index b9c859a58611e8..4775a965b70d77 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll @@ -21,7 +21,51 @@ define <8 x i8> @smax_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.smax.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res @@ -39,7 +83,90 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res @@ -59,11 +186,175 @@ define void @smax_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smax_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smax v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smax v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -84,7 +375,31 @@ define <4 x i16> @smax_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.smax.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res @@ -102,7 +417,50 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res @@ -122,11 +480,95 @@ define void @smax_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smax_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smax v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smax v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -147,7 +589,19 @@ define <2 x i32> @smax_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res @@ -165,7 +619,26 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %op1, <4 x i32> %op2) ret <4 x i32> %res @@ -185,11 +658,47 @@ define void @smax_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smax_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, gt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -211,8 +720,15 @@ define <1 x i64> @smax_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, gt +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.smax.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res @@ -231,8 +747,18 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smax_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmgt v2.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, gt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, gt +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.smax.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res @@ -252,14 +778,31 @@ define void @smax_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smax_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmgt v4.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: cmgt v5.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, gt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, gt +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, gt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, gt +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -284,7 +827,51 @@ define <8 x i8> @smin_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.smin.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res @@ -302,7 +889,90 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res @@ -322,11 +992,175 @@ define void @smin_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smin_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smin v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smin v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -347,7 +1181,31 @@ define <4 x i16> @smin_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.smin.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res @@ -365,7 +1223,50 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res @@ -385,11 +1286,95 @@ define void @smin_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smin_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smin v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smin v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -410,7 +1395,19 @@ define <2 x i32> @smin_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res @@ -428,7 +1425,26 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %op1, <4 x i32> %op2) ret <4 x i32> %res @@ -448,11 +1464,47 @@ define void @smin_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smin_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lt +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -474,8 +1526,15 @@ define <1 x i64> @smin_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lt +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.smin.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res @@ -494,8 +1553,18 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smin_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lt +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res @@ -515,14 +1584,31 @@ define void @smin_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smin_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmgt v4.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmgt v5.2d, v3.2d, v2.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lt +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lt +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lt +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -547,7 +1633,51 @@ define <8 x i8> @umax_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.umax.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res @@ -565,7 +1695,90 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res @@ -585,11 +1798,175 @@ define void @umax_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umax_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umax v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umax v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -610,7 +1987,31 @@ define <4 x i16> @umax_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.umax.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res @@ -628,7 +2029,50 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res @@ -648,11 +2092,95 @@ define void @umax_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umax_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umax v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umax v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -673,7 +2201,19 @@ define <2 x i32> @umax_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res @@ -691,7 +2231,26 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umax v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %op1, <4 x i32> %op2) ret <4 x i32> %res @@ -711,11 +2270,47 @@ define void @umax_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umax_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umax v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, hi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -737,8 +2332,15 @@ define <1 x i64> @umax_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, hi +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.umax.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res @@ -757,8 +2359,18 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umax_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmhi v2.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, hi +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, hi +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.umax.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res @@ -778,14 +2390,31 @@ define void @umax_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umax_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmhi v4.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: cmhi v5.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, hi +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, hi +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, hi +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, hi +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -810,7 +2439,51 @@ define <8 x i8> @umin_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.umin.v8i8(<8 x i8> %op1, <8 x i8> %op2) ret <8 x i8> %res @@ -828,7 +2501,90 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %op1, <16 x i8> %op2) ret <16 x i8> %res @@ -848,11 +2604,175 @@ define void @umin_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umin_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umin v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umin v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -873,7 +2793,31 @@ define <4 x i16> @umin_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.umin.v4i16(<4 x i16> %op1, <4 x i16> %op2) ret <4 x i16> %res @@ -891,7 +2835,50 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %op1, <8 x i16> %op2) ret <8 x i16> %res @@ -911,11 +2898,95 @@ define void @umin_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umin_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umin v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umin v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -936,7 +3007,19 @@ define <2 x i32> @umin_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %op1, <2 x i32> %op2) ret <2 x i32> %res @@ -954,7 +3037,26 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umin v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %op1, <4 x i32> %op2) ret <4 x i32> %res @@ -974,11 +3076,47 @@ define void @umin_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umin_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umin v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w10, w8 +; NONEON-NOSVE-NEXT: csel w11, w10, w8, lo +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -1000,8 +3138,15 @@ define <1 x i64> @umin_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lo +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.umin.v1i64(<1 x i64> %op1, <1 x i64> %op2) ret <1 x i64> %res @@ -1020,8 +3165,18 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umin_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lo +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lo +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %op1, <2 x i64> %op2) ret <2 x i64> %res @@ -1041,14 +3196,31 @@ define void @umin_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umin_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: cmhi v4.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmhi v5.2d, v3.2d, v2.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lo +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lo +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x10, x8 +; NONEON-NOSVE-NEXT: csel x11, x10, x8, lo +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, lo +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll index 3a03de3442d581..94d5bb1543b0e0 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mla-neon-fa64.ll @@ -24,8 +24,51 @@ define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { ; ; NONEON-NOSVE-LABEL: mla8xi8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mla v2.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #7] +; NONEON-NOSVE-NEXT: str d2, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #6] +; NONEON-NOSVE-NEXT: madd w1, w2, w1, w5 +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #2] +; NONEON-NOSVE-NEXT: strb w1, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #9] +; NONEON-NOSVE-NEXT: madd w1, w4, w3, w1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w1, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #21] +; NONEON-NOSVE-NEXT: madd w18, w0, w18, w1 +; NONEON-NOSVE-NEXT: strb w18, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #20] +; NONEON-NOSVE-NEXT: madd w16, w17, w16, w18 +; NONEON-NOSVE-NEXT: strb w16, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #19] +; NONEON-NOSVE-NEXT: madd w14, w15, w14, w16 +; NONEON-NOSVE-NEXT: strb w14, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #18] +; NONEON-NOSVE-NEXT: madd w12, w13, w12, w14 +; NONEON-NOSVE-NEXT: strb w12, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #17] +; NONEON-NOSVE-NEXT: madd w10, w11, w10, w12 +; NONEON-NOSVE-NEXT: strb w10, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: madd w8, w9, w8, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %tmp1 = mul <8 x i8> %A, %B; %tmp2 = add <8 x i8> %C, %tmp1; diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll index 1ed3d8fa39d8da..6198926c0b4381 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -40,12 +40,31 @@ define <4 x i8> @smulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #4 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w8, w12 +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #16] +; NONEON-NOSVE-NEXT: mul w9, w9, w13 +; NONEON-NOSVE-NEXT: mul w10, w10, w14 +; NONEON-NOSVE-NEXT: mul w11, w11, w12 +; NONEON-NOSVE-NEXT: ubfx w8, w8, #4, #12 +; NONEON-NOSVE-NEXT: ubfx w9, w9, #4, #12 +; NONEON-NOSVE-NEXT: ubfx w10, w10, #4, #12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ubfx w8, w11, #4, #12 +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i16> undef, i16 4, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer @@ -77,8 +96,51 @@ define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull v0.8h, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: shrn v0.8b, v0.8h, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w15, [sp, #15] +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #13] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w17, [sp, #22] +; NONEON-NOSVE-NEXT: mul w15, w15, w16 +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #21] +; NONEON-NOSVE-NEXT: ldrsb w18, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: mul w14, w14, w17 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: mul w13, w13, w16 +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #11] +; NONEON-NOSVE-NEXT: ldrsb w17, [sp, #16] +; NONEON-NOSVE-NEXT: mul w12, w12, w18 +; NONEON-NOSVE-NEXT: lsr w15, w15, #8 +; NONEON-NOSVE-NEXT: ldrsb w0, [sp, #19] +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w14, w14, #8 +; NONEON-NOSVE-NEXT: ldrsb w18, [sp, #17] +; NONEON-NOSVE-NEXT: mul w8, w8, w17 +; NONEON-NOSVE-NEXT: lsr w13, w13, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w0 +; NONEON-NOSVE-NEXT: lsr w12, w12, #8 +; NONEON-NOSVE-NEXT: strb w15, [sp, #31] +; NONEON-NOSVE-NEXT: mul w10, w10, w16 +; NONEON-NOSVE-NEXT: strb w14, [sp, #30] +; NONEON-NOSVE-NEXT: mul w9, w9, w18 +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: strb w13, [sp, #29] +; NONEON-NOSVE-NEXT: lsr w11, w11, #8 +; NONEON-NOSVE-NEXT: strb w12, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w10, w10, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: strb w11, [sp, #27] +; NONEON-NOSVE-NEXT: strb w10, [sp, #26] +; NONEON-NOSVE-NEXT: strb w9, [sp, #25] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i16> undef, i16 8, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer @@ -110,9 +172,116 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull2 v2.8h, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: smull v0.8h, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: str x27, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #96] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #112] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #144] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w6, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w7, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w19, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w20, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #49] +; NONEON-NOSVE-NEXT: str d0, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w21, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w23, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w25, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w26, [sp, #60] +; NONEON-NOSVE-NEXT: str d1, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #52] +; NONEON-NOSVE-NEXT: mul w20, w20, w21 +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w15, [sp, #54] +; NONEON-NOSVE-NEXT: mul w19, w19, w23 +; NONEON-NOSVE-NEXT: ldrsb w17, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w0, [sp, #40] +; NONEON-NOSVE-NEXT: mul w7, w7, w25 +; NONEON-NOSVE-NEXT: ldrsb w2, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w3, [sp, #42] +; NONEON-NOSVE-NEXT: mul w6, w6, w26 +; NONEON-NOSVE-NEXT: lsr w20, w20, #8 +; NONEON-NOSVE-NEXT: ldrsb w4, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #88] +; NONEON-NOSVE-NEXT: lsr w19, w19, #8 +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #89] +; NONEON-NOSVE-NEXT: ldrsb w18, [sp, #90] +; NONEON-NOSVE-NEXT: lsr w7, w7, #8 +; NONEON-NOSVE-NEXT: ldrsb w1, [sp, #91] +; NONEON-NOSVE-NEXT: ldrsb w5, [sp, #92] +; NONEON-NOSVE-NEXT: mul w9, w9, w16 +; NONEON-NOSVE-NEXT: lsr w6, w6, #8 +; NONEON-NOSVE-NEXT: ldrsb w22, [sp, #93] +; NONEON-NOSVE-NEXT: ldrsb w24, [sp, #94] +; NONEON-NOSVE-NEXT: mul w11, w11, w1 +; NONEON-NOSVE-NEXT: ldrsb w21, [sp, #95] +; NONEON-NOSVE-NEXT: ldrsb w23, [sp, #56] +; NONEON-NOSVE-NEXT: mul w12, w12, w5 +; NONEON-NOSVE-NEXT: ldrsb w27, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w25, [sp, #58] +; NONEON-NOSVE-NEXT: mul w15, w15, w24 +; NONEON-NOSVE-NEXT: ldrsb w26, [sp, #57] +; NONEON-NOSVE-NEXT: mul w0, w0, w23 +; NONEON-NOSVE-NEXT: lsr w11, w11, #8 +; NONEON-NOSVE-NEXT: mul w4, w4, w27 +; NONEON-NOSVE-NEXT: lsr w12, w12, #8 +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: mul w3, w3, w25 +; NONEON-NOSVE-NEXT: lsr w15, w15, #8 +; NONEON-NOSVE-NEXT: strb w20, [sp, #79] +; NONEON-NOSVE-NEXT: mul w2, w2, w26 +; NONEON-NOSVE-NEXT: lsr w0, w0, #8 +; NONEON-NOSVE-NEXT: strb w19, [sp, #78] +; NONEON-NOSVE-NEXT: mul w17, w17, w21 +; NONEON-NOSVE-NEXT: lsr w4, w4, #8 +; NONEON-NOSVE-NEXT: strb w7, [sp, #77] +; NONEON-NOSVE-NEXT: mul w13, w13, w22 +; NONEON-NOSVE-NEXT: lsr w3, w3, #8 +; NONEON-NOSVE-NEXT: strb w6, [sp, #76] +; NONEON-NOSVE-NEXT: mul w10, w10, w18 +; NONEON-NOSVE-NEXT: lsr w2, w2, #8 +; NONEON-NOSVE-NEXT: strb w4, [sp, #75] +; NONEON-NOSVE-NEXT: mul w8, w8, w14 +; NONEON-NOSVE-NEXT: lsr w17, w17, #8 +; NONEON-NOSVE-NEXT: strb w3, [sp, #74] +; NONEON-NOSVE-NEXT: lsr w13, w13, #8 +; NONEON-NOSVE-NEXT: strb w2, [sp, #73] +; NONEON-NOSVE-NEXT: ldr x27, [sp, #80] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w10, w10, #8 +; NONEON-NOSVE-NEXT: strb w0, [sp, #72] +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: strb w17, [sp, #71] +; NONEON-NOSVE-NEXT: strb w15, [sp, #70] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #144] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w13, [sp, #69] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w12, [sp, #68] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #112] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w11, [sp, #67] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #96] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w10, [sp, #66] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %1 = sext <16 x i8> %op1 to <16 x i16> %2 = sext <16 x i8> %op2 to <16 x i16> @@ -145,15 +314,251 @@ define void @smulh_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smulh_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smull2 v4.8h, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smull v0.8h, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: smull2 v1.8h, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: smull v2.8h, v2.8b, v3.8b -; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v4.16b -; NONEON-NOSVE-NEXT: uzp2 v1.16b, v2.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #384 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #288] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #304] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #320] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #336] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #352] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 384 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov x29, x0 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: str q1, [sp, #160] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: str q3, [sp, #144] +; NONEON-NOSVE-NEXT: str q2, [sp, #192] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #176] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #185] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #186] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #187] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #188] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #189] +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #229] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #227] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #228] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #190] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #191] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #177] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #226] +; NONEON-NOSVE-NEXT: ldrsb w2, [sp, #214] +; NONEON-NOSVE-NEXT: ldrsb w1, [sp, #215] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #178] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #179] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: ldrsb w4, [sp, #212] +; NONEON-NOSVE-NEXT: ldrsb w3, [sp, #213] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #180] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #181] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #247] +; NONEON-NOSVE-NEXT: ldrsb w15, [sp, #246] +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #244] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #182] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #183] +; NONEON-NOSVE-NEXT: mul w26, w12, w16 +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #242] +; NONEON-NOSVE-NEXT: ldrsb w16, [sp, #250] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #232] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #233] +; NONEON-NOSVE-NEXT: mul w30, w10, w12 +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #255] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #253] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #234] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #235] +; NONEON-NOSVE-NEXT: ldrsb w0, [sp, #248] +; NONEON-NOSVE-NEXT: ldrsb w18, [sp, #249] +; NONEON-NOSVE-NEXT: ldrsb w6, [sp, #210] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #236] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #237] +; NONEON-NOSVE-NEXT: ldrsb w5, [sp, #211] +; NONEON-NOSVE-NEXT: ldrsb w19, [sp, #208] +; NONEON-NOSVE-NEXT: ldrsb w7, [sp, #209] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #238] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #239] +; NONEON-NOSVE-NEXT: ldrsb w21, [sp, #222] +; NONEON-NOSVE-NEXT: ldrsb w20, [sp, #223] +; NONEON-NOSVE-NEXT: ldrsb w23, [sp, #220] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #224] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #225] +; NONEON-NOSVE-NEXT: ldrsb w22, [sp, #221] +; NONEON-NOSVE-NEXT: ldrsb w24, [sp, #219] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #230] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #231] +; NONEON-NOSVE-NEXT: mul w27, w8, w14 +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #245] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #217] +; NONEON-NOSVE-NEXT: mul w9, w9, w15 +; NONEON-NOSVE-NEXT: ldrsb w15, [sp, #251] +; NONEON-NOSVE-NEXT: mul w25, w13, w14 +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #243] +; NONEON-NOSVE-NEXT: lsr w14, w27, #8 +; NONEON-NOSVE-NEXT: ldrsb w27, [sp, #218] +; NONEON-NOSVE-NEXT: lsr w17, w9, #8 +; NONEON-NOSVE-NEXT: mul w28, w11, w13 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #216] +; NONEON-NOSVE-NEXT: strb w14, [sp, #287] +; NONEON-NOSVE-NEXT: lsr w14, w25, #8 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #241] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #240] +; NONEON-NOSVE-NEXT: strb w14, [sp, #285] +; NONEON-NOSVE-NEXT: lsr w14, w28, #8 +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #254] +; NONEON-NOSVE-NEXT: mul w8, w25, w8 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #252] +; NONEON-NOSVE-NEXT: strb w14, [sp, #283] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w9, w25, w9 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w17, [sp, #286] +; NONEON-NOSVE-NEXT: mul w12, w14, w12 +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: lsr w17, w26, #8 +; NONEON-NOSVE-NEXT: mul w10, w25, w10 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w14, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #281] +; NONEON-NOSVE-NEXT: mul w11, w25, w11 +; NONEON-NOSVE-NEXT: strb w17, [sp, #284] +; NONEON-NOSVE-NEXT: lsr w17, w30, #8 +; NONEON-NOSVE-NEXT: mul w13, w14, w13 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #280] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #320] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: mul w10, w10, w15 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #279] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w11, w11, w16 +; NONEON-NOSVE-NEXT: strb w9, [sp, #278] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: mul w12, w12, w18 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #277] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #276] +; NONEON-NOSVE-NEXT: mul w13, w13, w0 +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w10, w10, w1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #275] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w2 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #274] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #273] +; NONEON-NOSVE-NEXT: mul w12, w12, w3 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w13, w13, w4 +; NONEON-NOSVE-NEXT: strb w9, [sp, #272] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: mul w10, w10, w5 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #271] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #270] +; NONEON-NOSVE-NEXT: mul w11, w11, w6 +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w12, w12, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #269] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: mul w13, w13, w19 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #268] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #267] +; NONEON-NOSVE-NEXT: mul w10, w10, w20 +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w11, w11, w21 +; NONEON-NOSVE-NEXT: strb w9, [sp, #266] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w12, w12, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #265] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #264] +; NONEON-NOSVE-NEXT: mul w13, w13, w23 +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w15, w14, [sp, #16] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w10, w10, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #263] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w27 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #262] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #261] +; NONEON-NOSVE-NEXT: mul w12, w12, w15 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: strb w17, [sp, #282] +; NONEON-NOSVE-NEXT: mul w13, w13, w14 +; NONEON-NOSVE-NEXT: strb w9, [sp, #260] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #259] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: strb w9, [sp, #258] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #352] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #257] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #336] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #256] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #304] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #256] +; NONEON-NOSVE-NEXT: stp q0, q1, [x29] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #288] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #384 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -193,12 +598,20 @@ define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: shl v1.2s, v1.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 -; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w8, w10 +; NONEON-NOSVE-NEXT: mul w9, w9, w11 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i16> %op1 to <2 x i32> %2 = sext <2 x i16> %op2 to <2 x i32> @@ -228,8 +641,31 @@ define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull v0.4s, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: shrn v0.4h, v0.4s, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w12, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w13, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w14, [sp, #18] +; NONEON-NOSVE-NEXT: mul w11, w11, w12 +; NONEON-NOSVE-NEXT: ldrsh w12, [sp, #16] +; NONEON-NOSVE-NEXT: mul w10, w10, w13 +; NONEON-NOSVE-NEXT: mul w9, w9, w14 +; NONEON-NOSVE-NEXT: mul w8, w8, w12 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: strh w11, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = sext <4 x i16> %op1 to <4 x i32> %2 = sext <4 x i16> %op2 to <4 x i32> @@ -259,9 +695,54 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull2 v2.4s, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: smull v0.4s, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w15, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w12, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w13, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w14, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w16, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w17, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w18, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w0, [sp, #62] +; NONEON-NOSVE-NEXT: mul w15, w15, w16 +; NONEON-NOSVE-NEXT: ldrsh w16, [sp, #48] +; NONEON-NOSVE-NEXT: mul w14, w14, w17 +; NONEON-NOSVE-NEXT: ldrsh w17, [sp, #56] +; NONEON-NOSVE-NEXT: mul w13, w13, w18 +; NONEON-NOSVE-NEXT: ldrsh w18, [sp, #60] +; NONEON-NOSVE-NEXT: mul w12, w12, w16 +; NONEON-NOSVE-NEXT: ldrsh w16, [sp, #58] +; NONEON-NOSVE-NEXT: lsr w15, w15, #16 +; NONEON-NOSVE-NEXT: mul w11, w11, w0 +; NONEON-NOSVE-NEXT: lsr w14, w14, #16 +; NONEON-NOSVE-NEXT: mul w10, w10, w18 +; NONEON-NOSVE-NEXT: lsr w13, w13, #16 +; NONEON-NOSVE-NEXT: strh w15, [sp, #78] +; NONEON-NOSVE-NEXT: mul w9, w9, w16 +; NONEON-NOSVE-NEXT: lsr w12, w12, #16 +; NONEON-NOSVE-NEXT: strh w14, [sp, #76] +; NONEON-NOSVE-NEXT: mul w8, w8, w17 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: strh w13, [sp, #74] +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: strh w12, [sp, #72] +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: strh w11, [sp, #70] +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w10, [sp, #68] +; NONEON-NOSVE-NEXT: strh w9, [sp, #66] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %1 = sext <8 x i16> %op1 to <8 x i32> %2 = sext <8 x i16> %op2 to <8 x i32> @@ -294,15 +775,125 @@ define void @smulh_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smulh_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smull2 v4.4s, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smull v0.4s, v1.4h, v0.4h -; NONEON-NOSVE-NEXT: smull2 v1.4s, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: smull v2.4s, v2.4h, v3.4h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #240 +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #160] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 240 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: str q3, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w12, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w13, [sp, #50] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w7, [sp, #96] +; NONEON-NOSVE-NEXT: ldrsh w19, [sp, #98] +; NONEON-NOSVE-NEXT: ldrsh w20, [sp, #100] +; NONEON-NOSVE-NEXT: ldrsh w21, [sp, #102] +; NONEON-NOSVE-NEXT: ldrsh w14, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w16, [sp, #54] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w18, [sp, #104] +; NONEON-NOSVE-NEXT: ldrsh w2, [sp, #106] +; NONEON-NOSVE-NEXT: ldrsh w4, [sp, #108] +; NONEON-NOSVE-NEXT: ldrsh w5, [sp, #110] +; NONEON-NOSVE-NEXT: ldrsh w15, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w17, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w1, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w3, [sp, #94] +; NONEON-NOSVE-NEXT: mul w8, w8, w15 +; NONEON-NOSVE-NEXT: ldrsh w6, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w23, [sp, #82] +; NONEON-NOSVE-NEXT: mul w11, w11, w3 +; NONEON-NOSVE-NEXT: ldrsh w25, [sp, #84] +; NONEON-NOSVE-NEXT: mul w13, w13, w23 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: mul w14, w14, w25 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: mul w12, w12, w6 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: mul w10, w10, w1 +; NONEON-NOSVE-NEXT: lsr w13, w13, #16 +; NONEON-NOSVE-NEXT: ldrsh w22, [sp, #118] +; NONEON-NOSVE-NEXT: ldrsh w24, [sp, #116] +; NONEON-NOSVE-NEXT: ldrsh w26, [sp, #114] +; NONEON-NOSVE-NEXT: ldrsh w27, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w28, [sp, #126] +; NONEON-NOSVE-NEXT: mul w9, w9, w17 +; NONEON-NOSVE-NEXT: mul w21, w21, w22 +; NONEON-NOSVE-NEXT: ldrsh w22, [sp, #86] +; NONEON-NOSVE-NEXT: lsr w14, w14, #16 +; NONEON-NOSVE-NEXT: mul w20, w20, w24 +; NONEON-NOSVE-NEXT: ldrsh w24, [sp, #120] +; NONEON-NOSVE-NEXT: lsr w12, w12, #16 +; NONEON-NOSVE-NEXT: mul w19, w19, w26 +; NONEON-NOSVE-NEXT: ldrsh w26, [sp, #124] +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: mul w7, w7, w27 +; NONEON-NOSVE-NEXT: ldrsh w27, [sp, #122] +; NONEON-NOSVE-NEXT: lsr w21, w21, #16 +; NONEON-NOSVE-NEXT: mul w5, w5, w28 +; NONEON-NOSVE-NEXT: lsr w20, w20, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: mul w4, w4, w26 +; NONEON-NOSVE-NEXT: lsr w19, w19, #16 +; NONEON-NOSVE-NEXT: strh w21, [sp, #158] +; NONEON-NOSVE-NEXT: mul w2, w2, w27 +; NONEON-NOSVE-NEXT: lsr w7, w7, #16 +; NONEON-NOSVE-NEXT: strh w20, [sp, #156] +; NONEON-NOSVE-NEXT: mul w18, w18, w24 +; NONEON-NOSVE-NEXT: lsr w5, w5, #16 +; NONEON-NOSVE-NEXT: strh w19, [sp, #154] +; NONEON-NOSVE-NEXT: mul w16, w16, w22 +; NONEON-NOSVE-NEXT: lsr w4, w4, #16 +; NONEON-NOSVE-NEXT: strh w7, [sp, #152] +; NONEON-NOSVE-NEXT: lsr w2, w2, #16 +; NONEON-NOSVE-NEXT: strh w5, [sp, #150] +; NONEON-NOSVE-NEXT: lsr w18, w18, #16 +; NONEON-NOSVE-NEXT: strh w4, [sp, #148] +; NONEON-NOSVE-NEXT: lsr w16, w16, #16 +; NONEON-NOSVE-NEXT: strh w2, [sp, #146] +; NONEON-NOSVE-NEXT: strh w18, [sp, #144] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w16, [sp, #142] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w14, [sp, #140] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w13, [sp, #138] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w12, [sp, #136] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w11, [sp, #134] +; NONEON-NOSVE-NEXT: strh w10, [sp, #132] +; NONEON-NOSVE-NEXT: strh w9, [sp, #130] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #240 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -335,8 +926,18 @@ define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull v0.2d, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: shrn v0.2s, v0.2d, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #8] +; NONEON-NOSVE-NEXT: ldpsw x11, x10, [sp, #16] +; NONEON-NOSVE-NEXT: smull x9, w9, w10 +; NONEON-NOSVE-NEXT: smull x8, w8, w11 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i32> %op1 to <2 x i64> %2 = sext <2 x i32> %op2 to <2 x i64> @@ -366,9 +967,28 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smull2 v2.2d, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: smull v0.2d, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #40] +; NONEON-NOSVE-NEXT: ldpsw x10, x11, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldpsw x13, x12, [sp, #48] +; NONEON-NOSVE-NEXT: smull x11, w11, w12 +; NONEON-NOSVE-NEXT: ldpsw x12, x14, [sp, #56] +; NONEON-NOSVE-NEXT: smull x10, w10, w13 +; NONEON-NOSVE-NEXT: lsr x11, x11, #32 +; NONEON-NOSVE-NEXT: smull x9, w9, w14 +; NONEON-NOSVE-NEXT: smull x8, w8, w12 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #72] +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %1 = sext <4 x i32> %op1 to <4 x i64> %2 = sext <4 x i32> %op2 to <4 x i64> @@ -401,15 +1021,52 @@ define void @smulh_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smulh_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: smull2 v4.2d, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smull v0.2d, v1.2s, v0.2s -; NONEON-NOSVE-NEXT: smull2 v1.2d, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: smull v2.2d, v2.2s, v3.2s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp2 v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldpsw x8, x9, [sp, #56] +; NONEON-NOSVE-NEXT: ldpsw x10, x11, [sp, #48] +; NONEON-NOSVE-NEXT: ldpsw x12, x13, [sp, #104] +; NONEON-NOSVE-NEXT: ldpsw x14, x15, [sp, #96] +; NONEON-NOSVE-NEXT: str q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldpsw x17, x16, [sp, #112] +; NONEON-NOSVE-NEXT: smull x15, w15, w16 +; NONEON-NOSVE-NEXT: ldpsw x16, x18, [sp, #120] +; NONEON-NOSVE-NEXT: smull x14, w14, w17 +; NONEON-NOSVE-NEXT: ldpsw x17, x1, [sp, #80] +; NONEON-NOSVE-NEXT: smull x13, w13, w18 +; NONEON-NOSVE-NEXT: lsr x15, x15, #32 +; NONEON-NOSVE-NEXT: smull x12, w12, w16 +; NONEON-NOSVE-NEXT: lsr x14, x14, #32 +; NONEON-NOSVE-NEXT: ldpsw x16, x18, [sp, #88] +; NONEON-NOSVE-NEXT: smull x11, w11, w1 +; NONEON-NOSVE-NEXT: lsr x13, x13, #32 +; NONEON-NOSVE-NEXT: stp w14, w15, [sp, #152] +; NONEON-NOSVE-NEXT: smull x10, w10, w17 +; NONEON-NOSVE-NEXT: lsr x12, x12, #32 +; NONEON-NOSVE-NEXT: smull x9, w9, w18 +; NONEON-NOSVE-NEXT: smull x8, w8, w16 +; NONEON-NOSVE-NEXT: lsr x11, x11, #32 +; NONEON-NOSVE-NEXT: stp w12, w13, [sp, #144] +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -442,12 +1099,14 @@ define <1 x i64> @smulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d0 ; NONEON-NOSVE-NEXT: fmov x9, d1 ; NONEON-NOSVE-NEXT: smulh x8, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x i128> undef, i128 64, i128 0 %splat = shufflevector <1 x i128> %insert, <1 x i128> undef, <1 x i32> zeroinitializer @@ -479,15 +1138,17 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: smulh_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: fmov x10, d0 -; NONEON-NOSVE-NEXT: fmov x11, d1 -; NONEON-NOSVE-NEXT: smulh x10, x10, x11 -; NONEON-NOSVE-NEXT: smulh x8, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp, #16] +; NONEON-NOSVE-NEXT: smulh x8, x8, x10 +; NONEON-NOSVE-NEXT: smulh x9, x9, x11 +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %1 = sext <2 x i64> %op1 to <2 x i128> %2 = sext <2 x i64> %op2 to <2 x i128> @@ -520,27 +1181,29 @@ define void @smulh_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: smulh_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x11, v0.d[1] -; NONEON-NOSVE-NEXT: mov x14, v3.d[1] -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: mov x10, v1.d[1] -; NONEON-NOSVE-NEXT: mov x13, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x12, d3 -; NONEON-NOSVE-NEXT: smulh x8, x8, x9 -; NONEON-NOSVE-NEXT: fmov x9, d2 -; NONEON-NOSVE-NEXT: smulh x10, x10, x11 -; NONEON-NOSVE-NEXT: smulh x9, x9, x12 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: smulh x11, x13, x14 -; NONEON-NOSVE-NEXT: fmov d1, x10 -; NONEON-NOSVE-NEXT: fmov d2, x9 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] -; NONEON-NOSVE-NEXT: fmov d3, x11 -; NONEON-NOSVE-NEXT: mov v2.d[1], v3.d[0] -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [sp] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x13, x12, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: smulh x10, x10, x12 +; NONEON-NOSVE-NEXT: ldp x14, x12, [sp, #48] +; NONEON-NOSVE-NEXT: smulh x11, x11, x13 +; NONEON-NOSVE-NEXT: smulh x8, x8, x12 +; NONEON-NOSVE-NEXT: smulh x9, x9, x14 +; NONEON-NOSVE-NEXT: stp x11, x10, [sp, #64] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -583,11 +1246,31 @@ define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: mul v0.4h, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #4 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #18] +; NONEON-NOSVE-NEXT: mul w8, w8, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #16] +; NONEON-NOSVE-NEXT: mul w9, w9, w13 +; NONEON-NOSVE-NEXT: mul w10, w10, w14 +; NONEON-NOSVE-NEXT: mul w11, w11, w12 +; NONEON-NOSVE-NEXT: lsr w8, w8, #4 +; NONEON-NOSVE-NEXT: lsr w9, w9, #4 +; NONEON-NOSVE-NEXT: lsr w10, w10, #4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w11, #4 +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i8> %op1 to <4 x i16> %2 = zext <4 x i8> %op2 to <4 x i16> @@ -617,8 +1300,51 @@ define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull v0.8h, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: shrn v0.8b, v0.8h, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #22] +; NONEON-NOSVE-NEXT: mul w15, w15, w16 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: mul w14, w14, w17 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: mul w13, w13, w16 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #16] +; NONEON-NOSVE-NEXT: mul w12, w12, w18 +; NONEON-NOSVE-NEXT: lsr w15, w15, #8 +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w14, w14, #8 +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #17] +; NONEON-NOSVE-NEXT: mul w8, w8, w17 +; NONEON-NOSVE-NEXT: lsr w13, w13, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w0 +; NONEON-NOSVE-NEXT: lsr w12, w12, #8 +; NONEON-NOSVE-NEXT: strb w15, [sp, #31] +; NONEON-NOSVE-NEXT: mul w10, w10, w16 +; NONEON-NOSVE-NEXT: strb w14, [sp, #30] +; NONEON-NOSVE-NEXT: mul w9, w9, w18 +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: strb w13, [sp, #29] +; NONEON-NOSVE-NEXT: lsr w11, w11, #8 +; NONEON-NOSVE-NEXT: strb w12, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w10, w10, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: strb w11, [sp, #27] +; NONEON-NOSVE-NEXT: strb w10, [sp, #26] +; NONEON-NOSVE-NEXT: strb w9, [sp, #25] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = zext <8 x i8> %op1 to <8 x i16> %2 = zext <8 x i8> %op2 to <8 x i16> @@ -648,9 +1374,116 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull2 v2.8h, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: umull v0.8h, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: str x27, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #96] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #112] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #144] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -80 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d2, d0, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #49] +; NONEON-NOSVE-NEXT: str d0, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #60] +; NONEON-NOSVE-NEXT: str d1, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #52] +; NONEON-NOSVE-NEXT: mul w20, w20, w21 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #54] +; NONEON-NOSVE-NEXT: mul w19, w19, w23 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #40] +; NONEON-NOSVE-NEXT: mul w7, w7, w25 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #42] +; NONEON-NOSVE-NEXT: mul w6, w6, w26 +; NONEON-NOSVE-NEXT: lsr w20, w20, #8 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #88] +; NONEON-NOSVE-NEXT: lsr w19, w19, #8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #90] +; NONEON-NOSVE-NEXT: lsr w7, w7, #8 +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #92] +; NONEON-NOSVE-NEXT: mul w9, w9, w16 +; NONEON-NOSVE-NEXT: lsr w6, w6, #8 +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #94] +; NONEON-NOSVE-NEXT: mul w11, w11, w1 +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #56] +; NONEON-NOSVE-NEXT: mul w12, w12, w5 +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #58] +; NONEON-NOSVE-NEXT: mul w15, w15, w24 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #57] +; NONEON-NOSVE-NEXT: mul w0, w0, w23 +; NONEON-NOSVE-NEXT: lsr w11, w11, #8 +; NONEON-NOSVE-NEXT: mul w4, w4, w27 +; NONEON-NOSVE-NEXT: lsr w12, w12, #8 +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: mul w3, w3, w25 +; NONEON-NOSVE-NEXT: lsr w15, w15, #8 +; NONEON-NOSVE-NEXT: strb w20, [sp, #79] +; NONEON-NOSVE-NEXT: mul w2, w2, w26 +; NONEON-NOSVE-NEXT: lsr w0, w0, #8 +; NONEON-NOSVE-NEXT: strb w19, [sp, #78] +; NONEON-NOSVE-NEXT: mul w17, w17, w21 +; NONEON-NOSVE-NEXT: lsr w4, w4, #8 +; NONEON-NOSVE-NEXT: strb w7, [sp, #77] +; NONEON-NOSVE-NEXT: mul w13, w13, w22 +; NONEON-NOSVE-NEXT: lsr w3, w3, #8 +; NONEON-NOSVE-NEXT: strb w6, [sp, #76] +; NONEON-NOSVE-NEXT: mul w10, w10, w18 +; NONEON-NOSVE-NEXT: lsr w2, w2, #8 +; NONEON-NOSVE-NEXT: strb w4, [sp, #75] +; NONEON-NOSVE-NEXT: mul w8, w8, w14 +; NONEON-NOSVE-NEXT: lsr w17, w17, #8 +; NONEON-NOSVE-NEXT: strb w3, [sp, #74] +; NONEON-NOSVE-NEXT: lsr w13, w13, #8 +; NONEON-NOSVE-NEXT: strb w2, [sp, #73] +; NONEON-NOSVE-NEXT: ldr x27, [sp, #80] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w10, w10, #8 +; NONEON-NOSVE-NEXT: strb w0, [sp, #72] +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: strb w17, [sp, #71] +; NONEON-NOSVE-NEXT: strb w15, [sp, #70] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #144] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w13, [sp, #69] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w12, [sp, #68] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #112] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w11, [sp, #67] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #96] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w10, [sp, #66] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %1 = zext <16 x i8> %op1 to <16 x i16> %2 = zext <16 x i8> %op2 to <16 x i16> @@ -683,15 +1516,251 @@ define void @umulh_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umulh_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umull2 v4.8h, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umull v0.8h, v1.8b, v0.8b -; NONEON-NOSVE-NEXT: umull2 v1.8h, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: umull v2.8h, v2.8b, v3.8b -; NONEON-NOSVE-NEXT: uzp2 v0.16b, v0.16b, v4.16b -; NONEON-NOSVE-NEXT: uzp2 v1.16b, v2.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #384 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #288] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #304] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #320] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #336] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #352] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 384 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: mov x29, x0 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: str q1, [sp, #160] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #128] +; NONEON-NOSVE-NEXT: str q3, [sp, #144] +; NONEON-NOSVE-NEXT: str q2, [sp, #192] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #176] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #185] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #186] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #187] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #144] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #188] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #189] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #229] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #227] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #228] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #190] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #191] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #192] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #177] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #226] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #214] +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #215] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #178] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #179] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #212] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #213] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #180] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #181] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #247] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #246] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #244] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #182] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #183] +; NONEON-NOSVE-NEXT: mul w26, w12, w16 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #242] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #250] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #232] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #233] +; NONEON-NOSVE-NEXT: mul w30, w10, w12 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #255] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #253] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #234] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #235] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #248] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #249] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #210] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #236] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #237] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #211] +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #208] +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #209] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #238] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #239] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #222] +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #223] +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #220] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #224] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #225] +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #221] +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #219] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #230] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #231] +; NONEON-NOSVE-NEXT: mul w27, w8, w14 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #245] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #217] +; NONEON-NOSVE-NEXT: mul w9, w9, w15 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #251] +; NONEON-NOSVE-NEXT: mul w25, w13, w14 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #243] +; NONEON-NOSVE-NEXT: lsr w14, w27, #8 +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #218] +; NONEON-NOSVE-NEXT: lsr w17, w9, #8 +; NONEON-NOSVE-NEXT: mul w28, w11, w13 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #216] +; NONEON-NOSVE-NEXT: strb w14, [sp, #287] +; NONEON-NOSVE-NEXT: lsr w14, w25, #8 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #241] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #240] +; NONEON-NOSVE-NEXT: strb w14, [sp, #285] +; NONEON-NOSVE-NEXT: lsr w14, w28, #8 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #254] +; NONEON-NOSVE-NEXT: mul w8, w25, w8 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #252] +; NONEON-NOSVE-NEXT: strb w14, [sp, #283] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w9, w25, w9 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w17, [sp, #286] +; NONEON-NOSVE-NEXT: mul w12, w14, w12 +; NONEON-NOSVE-NEXT: lsr w8, w8, #8 +; NONEON-NOSVE-NEXT: lsr w17, w26, #8 +; NONEON-NOSVE-NEXT: mul w10, w25, w10 +; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldr w14, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w9, w9, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #281] +; NONEON-NOSVE-NEXT: mul w11, w25, w11 +; NONEON-NOSVE-NEXT: strb w17, [sp, #284] +; NONEON-NOSVE-NEXT: lsr w17, w30, #8 +; NONEON-NOSVE-NEXT: mul w13, w14, w13 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #280] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #320] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: mul w10, w10, w15 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #279] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w11, w11, w16 +; NONEON-NOSVE-NEXT: strb w9, [sp, #278] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: mul w12, w12, w18 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #277] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #276] +; NONEON-NOSVE-NEXT: mul w13, w13, w0 +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w10, w10, w1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #275] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w2 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #274] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #273] +; NONEON-NOSVE-NEXT: mul w12, w12, w3 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w13, w13, w4 +; NONEON-NOSVE-NEXT: strb w9, [sp, #272] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: mul w10, w10, w5 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #271] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #270] +; NONEON-NOSVE-NEXT: mul w11, w11, w6 +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w12, w12, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #269] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: mul w13, w13, w19 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #268] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #267] +; NONEON-NOSVE-NEXT: mul w10, w10, w20 +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w11, w11, w21 +; NONEON-NOSVE-NEXT: strb w9, [sp, #266] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w12, w12, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #265] +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #264] +; NONEON-NOSVE-NEXT: mul w13, w13, w23 +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp w15, w14, [sp, #16] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: mul w10, w10, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #263] +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: mul w11, w11, w27 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #262] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #261] +; NONEON-NOSVE-NEXT: mul w12, w12, w15 +; NONEON-NOSVE-NEXT: lsr w8, w10, #8 +; NONEON-NOSVE-NEXT: strb w17, [sp, #282] +; NONEON-NOSVE-NEXT: mul w13, w13, w14 +; NONEON-NOSVE-NEXT: strb w9, [sp, #260] +; NONEON-NOSVE-NEXT: lsr w9, w11, #8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #259] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: lsr w8, w12, #8 +; NONEON-NOSVE-NEXT: strb w9, [sp, #258] +; NONEON-NOSVE-NEXT: lsr w9, w13, #8 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #352] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #257] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #336] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #256] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #304] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #256] +; NONEON-NOSVE-NEXT: stp q0, q1, [x29] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #288] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #384 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -730,11 +1799,20 @@ define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: mul v0.2s, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #16] +; NONEON-NOSVE-NEXT: mul w8, w8, w10 +; NONEON-NOSVE-NEXT: mul w9, w9, w11 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i16> %op1 to <2 x i32> %2 = zext <2 x i16> %op2 to <2 x i32> @@ -764,8 +1842,31 @@ define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull v0.4s, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: shrn v0.4h, v0.4s, #16 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #18] +; NONEON-NOSVE-NEXT: mul w11, w11, w12 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #16] +; NONEON-NOSVE-NEXT: mul w10, w10, w13 +; NONEON-NOSVE-NEXT: mul w9, w9, w14 +; NONEON-NOSVE-NEXT: mul w8, w8, w12 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: strh w11, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i16> %op1 to <4 x i32> %2 = zext <4 x i16> %op2 to <4 x i32> @@ -795,9 +1896,54 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull2 v2.4s, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: umull v0.4s, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #62] +; NONEON-NOSVE-NEXT: mul w15, w15, w16 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #48] +; NONEON-NOSVE-NEXT: mul w14, w14, w17 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #56] +; NONEON-NOSVE-NEXT: mul w13, w13, w18 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #60] +; NONEON-NOSVE-NEXT: mul w12, w12, w16 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #58] +; NONEON-NOSVE-NEXT: lsr w15, w15, #16 +; NONEON-NOSVE-NEXT: mul w11, w11, w0 +; NONEON-NOSVE-NEXT: lsr w14, w14, #16 +; NONEON-NOSVE-NEXT: mul w10, w10, w18 +; NONEON-NOSVE-NEXT: lsr w13, w13, #16 +; NONEON-NOSVE-NEXT: strh w15, [sp, #78] +; NONEON-NOSVE-NEXT: mul w9, w9, w16 +; NONEON-NOSVE-NEXT: lsr w12, w12, #16 +; NONEON-NOSVE-NEXT: strh w14, [sp, #76] +; NONEON-NOSVE-NEXT: mul w8, w8, w17 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: strh w13, [sp, #74] +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: strh w12, [sp, #72] +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: strh w11, [sp, #70] +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w10, [sp, #68] +; NONEON-NOSVE-NEXT: strh w9, [sp, #66] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %1 = zext <8 x i16> %op1 to <8 x i32> %2 = zext <8 x i16> %op2 to <8 x i32> @@ -830,15 +1976,125 @@ define void @umulh_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umulh_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umull2 v4.4s, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umull v0.4s, v1.4h, v0.4h -; NONEON-NOSVE-NEXT: umull2 v1.4s, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: umull v2.4s, v2.4h, v3.4h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp2 v1.8h, v2.8h, v1.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #240 +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #160] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 240 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: str q3, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #50] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #98] +; NONEON-NOSVE-NEXT: ldrh w20, [sp, #100] +; NONEON-NOSVE-NEXT: ldrh w21, [sp, #102] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #54] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #104] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #106] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #108] +; NONEON-NOSVE-NEXT: ldrh w5, [sp, #110] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w1, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #94] +; NONEON-NOSVE-NEXT: mul w8, w8, w15 +; NONEON-NOSVE-NEXT: ldrh w6, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w23, [sp, #82] +; NONEON-NOSVE-NEXT: mul w11, w11, w3 +; NONEON-NOSVE-NEXT: ldrh w25, [sp, #84] +; NONEON-NOSVE-NEXT: mul w13, w13, w23 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: mul w14, w14, w25 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: mul w12, w12, w6 +; NONEON-NOSVE-NEXT: lsr w11, w11, #16 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: mul w10, w10, w1 +; NONEON-NOSVE-NEXT: lsr w13, w13, #16 +; NONEON-NOSVE-NEXT: ldrh w22, [sp, #118] +; NONEON-NOSVE-NEXT: ldrh w24, [sp, #116] +; NONEON-NOSVE-NEXT: ldrh w26, [sp, #114] +; NONEON-NOSVE-NEXT: ldrh w27, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w28, [sp, #126] +; NONEON-NOSVE-NEXT: mul w9, w9, w17 +; NONEON-NOSVE-NEXT: mul w21, w21, w22 +; NONEON-NOSVE-NEXT: ldrh w22, [sp, #86] +; NONEON-NOSVE-NEXT: lsr w14, w14, #16 +; NONEON-NOSVE-NEXT: mul w20, w20, w24 +; NONEON-NOSVE-NEXT: ldrh w24, [sp, #120] +; NONEON-NOSVE-NEXT: lsr w12, w12, #16 +; NONEON-NOSVE-NEXT: mul w19, w19, w26 +; NONEON-NOSVE-NEXT: ldrh w26, [sp, #124] +; NONEON-NOSVE-NEXT: lsr w10, w10, #16 +; NONEON-NOSVE-NEXT: mul w7, w7, w27 +; NONEON-NOSVE-NEXT: ldrh w27, [sp, #122] +; NONEON-NOSVE-NEXT: lsr w21, w21, #16 +; NONEON-NOSVE-NEXT: mul w5, w5, w28 +; NONEON-NOSVE-NEXT: lsr w20, w20, #16 +; NONEON-NOSVE-NEXT: lsr w9, w9, #16 +; NONEON-NOSVE-NEXT: mul w4, w4, w26 +; NONEON-NOSVE-NEXT: lsr w19, w19, #16 +; NONEON-NOSVE-NEXT: strh w21, [sp, #158] +; NONEON-NOSVE-NEXT: mul w2, w2, w27 +; NONEON-NOSVE-NEXT: lsr w7, w7, #16 +; NONEON-NOSVE-NEXT: strh w20, [sp, #156] +; NONEON-NOSVE-NEXT: mul w18, w18, w24 +; NONEON-NOSVE-NEXT: lsr w5, w5, #16 +; NONEON-NOSVE-NEXT: strh w19, [sp, #154] +; NONEON-NOSVE-NEXT: mul w16, w16, w22 +; NONEON-NOSVE-NEXT: lsr w4, w4, #16 +; NONEON-NOSVE-NEXT: strh w7, [sp, #152] +; NONEON-NOSVE-NEXT: lsr w2, w2, #16 +; NONEON-NOSVE-NEXT: strh w5, [sp, #150] +; NONEON-NOSVE-NEXT: lsr w18, w18, #16 +; NONEON-NOSVE-NEXT: strh w4, [sp, #148] +; NONEON-NOSVE-NEXT: lsr w16, w16, #16 +; NONEON-NOSVE-NEXT: strh w2, [sp, #146] +; NONEON-NOSVE-NEXT: strh w18, [sp, #144] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w16, [sp, #142] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w14, [sp, #140] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w13, [sp, #138] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w12, [sp, #136] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w11, [sp, #134] +; NONEON-NOSVE-NEXT: strh w10, [sp, #132] +; NONEON-NOSVE-NEXT: strh w9, [sp, #130] +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #240 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -871,8 +2127,18 @@ define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull v0.2d, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: shrn v0.2s, v0.2d, #32 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #16] +; NONEON-NOSVE-NEXT: umull x9, w9, w10 +; NONEON-NOSVE-NEXT: umull x8, w8, w11 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i32> %op1 to <2 x i64> %2 = zext <2 x i32> %op2 to <2 x i64> @@ -902,9 +2168,28 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umull2 v2.2d, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: umull v0.2d, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v2.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w13, w12, [sp, #48] +; NONEON-NOSVE-NEXT: umull x11, w11, w12 +; NONEON-NOSVE-NEXT: ldp w12, w14, [sp, #56] +; NONEON-NOSVE-NEXT: umull x10, w10, w13 +; NONEON-NOSVE-NEXT: lsr x11, x11, #32 +; NONEON-NOSVE-NEXT: umull x9, w9, w14 +; NONEON-NOSVE-NEXT: umull x8, w8, w12 +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #72] +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %1 = zext <4 x i32> %op1 to <4 x i64> %2 = zext <4 x i32> %op2 to <4 x i64> @@ -937,15 +2222,52 @@ define void @umulh_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umulh_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: umull2 v4.2d, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umull v0.2d, v1.2s, v0.2s -; NONEON-NOSVE-NEXT: umull2 v1.2d, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: umull v2.2d, v2.2s, v3.2s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v0.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp2 v1.4s, v2.4s, v1.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: str q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #48] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #96] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #104] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w17, w16, [sp, #112] +; NONEON-NOSVE-NEXT: umull x15, w15, w16 +; NONEON-NOSVE-NEXT: ldp w16, w18, [sp, #120] +; NONEON-NOSVE-NEXT: umull x14, w14, w17 +; NONEON-NOSVE-NEXT: ldp w17, w1, [sp, #80] +; NONEON-NOSVE-NEXT: umull x13, w13, w18 +; NONEON-NOSVE-NEXT: lsr x15, x15, #32 +; NONEON-NOSVE-NEXT: umull x12, w12, w16 +; NONEON-NOSVE-NEXT: lsr x14, x14, #32 +; NONEON-NOSVE-NEXT: ldp w16, w18, [sp, #88] +; NONEON-NOSVE-NEXT: umull x11, w11, w1 +; NONEON-NOSVE-NEXT: lsr x13, x13, #32 +; NONEON-NOSVE-NEXT: stp w14, w15, [sp, #152] +; NONEON-NOSVE-NEXT: umull x10, w10, w17 +; NONEON-NOSVE-NEXT: lsr x12, x12, #32 +; NONEON-NOSVE-NEXT: umull x9, w9, w18 +; NONEON-NOSVE-NEXT: umull x8, w8, w16 +; NONEON-NOSVE-NEXT: lsr x11, x11, #32 +; NONEON-NOSVE-NEXT: stp w12, w13, [sp, #144] +; NONEON-NOSVE-NEXT: lsr x10, x10, #32 +; NONEON-NOSVE-NEXT: lsr x9, x9, #32 +; NONEON-NOSVE-NEXT: lsr x8, x8, #32 +; NONEON-NOSVE-NEXT: stp w10, w11, [sp, #136] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -980,12 +2302,14 @@ define <1 x i64> @umulh_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d0 ; NONEON-NOSVE-NEXT: fmov x9, d1 ; NONEON-NOSVE-NEXT: umulh x8, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %1 = zext <1 x i64> %op1 to <1 x i128> %2 = zext <1 x i64> %op2 to <1 x i128> @@ -1015,15 +2339,17 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: umulh_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: mov x9, v1.d[1] -; NONEON-NOSVE-NEXT: fmov x10, d0 -; NONEON-NOSVE-NEXT: fmov x11, d1 -; NONEON-NOSVE-NEXT: umulh x10, x10, x11 -; NONEON-NOSVE-NEXT: umulh x8, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp, #16] +; NONEON-NOSVE-NEXT: umulh x8, x8, x10 +; NONEON-NOSVE-NEXT: umulh x9, x9, x11 +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %1 = zext <2 x i64> %op1 to <2 x i128> %2 = zext <2 x i64> %op2 to <2 x i128> @@ -1056,27 +2382,29 @@ define void @umulh_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: umulh_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x11, v0.d[1] -; NONEON-NOSVE-NEXT: mov x14, v3.d[1] -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: mov x10, v1.d[1] -; NONEON-NOSVE-NEXT: mov x13, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x12, d3 -; NONEON-NOSVE-NEXT: umulh x8, x8, x9 -; NONEON-NOSVE-NEXT: fmov x9, d2 -; NONEON-NOSVE-NEXT: umulh x10, x10, x11 -; NONEON-NOSVE-NEXT: umulh x9, x9, x12 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: umulh x11, x13, x14 -; NONEON-NOSVE-NEXT: fmov d1, x10 -; NONEON-NOSVE-NEXT: fmov d2, x9 -; NONEON-NOSVE-NEXT: mov v0.d[1], v1.d[0] -; NONEON-NOSVE-NEXT: fmov d3, x11 -; NONEON-NOSVE-NEXT: mov v2.d[1], v3.d[0] -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q2, [sp] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x13, x12, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: umulh x10, x10, x12 +; NONEON-NOSVE-NEXT: ldp x14, x12, [sp, #48] +; NONEON-NOSVE-NEXT: umulh x11, x11, x13 +; NONEON-NOSVE-NEXT: umulh x8, x8, x12 +; NONEON-NOSVE-NEXT: umulh x9, x9, x14 +; NONEON-NOSVE-NEXT: stp x11, x10, [sp, #64] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll index ad75ba62e17cf8..7bdb4599707b0c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll @@ -21,8 +21,25 @@ define i8 @uaddv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: add w12, w13, w12 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: add w10, w12, w10 +; NONEON-NOSVE-NEXT: add w8, w8, w14 +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: add w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a) ret i8 %res @@ -40,8 +57,40 @@ define i8 @uaddv_v16i8(<16 x i8> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #2] +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: add w11, w14, w13 +; NONEON-NOSVE-NEXT: add w9, w12, w9 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #7] +; NONEON-NOSVE-NEXT: add w10, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: add w9, w9, w16 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #9] +; NONEON-NOSVE-NEXT: add w12, w12, w15 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w13, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #15] +; NONEON-NOSVE-NEXT: add w12, w12, w14 +; NONEON-NOSVE-NEXT: add w8, w8, w11 +; NONEON-NOSVE-NEXT: add w9, w10, w9 +; NONEON-NOSVE-NEXT: add w10, w12, w16 +; NONEON-NOSVE-NEXT: add w8, w8, w15 +; NONEON-NOSVE-NEXT: add w9, w9, w10 +; NONEON-NOSVE-NEXT: add w8, w8, w13 +; NONEON-NOSVE-NEXT: add w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a) ret i8 %res @@ -61,9 +110,72 @@ define i8 @uaddv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: uaddv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: addv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #3] +; NONEON-NOSVE-NEXT: add w9, w11, w10 +; NONEON-NOSVE-NEXT: add w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: add w11, w15, w14 +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #4] +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #6] +; NONEON-NOSVE-NEXT: add w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: add w10, w14, w10 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #7] +; NONEON-NOSVE-NEXT: add w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w14, w13 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #9] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w11, w15, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #26] +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #10] +; NONEON-NOSVE-NEXT: add w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #12] +; NONEON-NOSVE-NEXT: add w9, w9, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #14] +; NONEON-NOSVE-NEXT: add w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #15] +; NONEON-NOSVE-NEXT: add w10, w13, w10 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: add w14, w15, w14 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #30] +; NONEON-NOSVE-NEXT: add w9, w9, w14 +; NONEON-NOSVE-NEXT: add w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w10, w10, w12 +; NONEON-NOSVE-NEXT: add w11, w16, w11 +; NONEON-NOSVE-NEXT: add w10, w10, w11 +; NONEON-NOSVE-NEXT: add w11, w17, w13 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op) @@ -82,8 +194,17 @@ define i16 @uaddv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: add w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a) ret i16 %res @@ -101,8 +222,24 @@ define i16 @uaddv_v8i16(<8 x i16> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w13, [sp] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w12, w13, w12 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: add w10, w12, w10 +; NONEON-NOSVE-NEXT: add w8, w8, w14 +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: add w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a) ret i16 %res @@ -122,9 +259,40 @@ define i16 @uaddv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: uaddv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #6] +; NONEON-NOSVE-NEXT: add w9, w11, w10 +; NONEON-NOSVE-NEXT: add w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: add w13, w15, w14 +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #8] +; NONEON-NOSVE-NEXT: add w9, w12, w13 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #12] +; NONEON-NOSVE-NEXT: add w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #14] +; NONEON-NOSVE-NEXT: add w10, w14, w10 +; NONEON-NOSVE-NEXT: add w11, w15, w11 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w13, w12 +; NONEON-NOSVE-NEXT: add w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op) @@ -143,8 +311,12 @@ define i32 @uaddv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a) ret i32 %res @@ -162,8 +334,13 @@ define i32 @uaddv_v4i32(<4 x i32> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp], #16 +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w0, w10, w8 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) ret i32 %res @@ -183,9 +360,20 @@ define i32 @uaddv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: uaddv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: add v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: addv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: add w9, w11, w9 +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w10, w14, w12 +; NONEON-NOSVE-NEXT: add w11, w15, w13 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op) @@ -203,8 +391,10 @@ define i64 @uaddv_v2i64(<2 x i64> %a) { ; ; NONEON-NOSVE-LABEL: uaddv_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: addp d0, v0.2d -; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: add x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a) ret i64 %res @@ -223,9 +413,13 @@ define i64 @uaddv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: uaddv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: add v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: addp d0, v0.2d -; NONEON-NOSVE-NEXT: fmov x0, d0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp], #32 +; NONEON-NOSVE-NEXT: add x8, x10, x8 +; NONEON-NOSVE-NEXT: add x9, x11, x9 +; NONEON-NOSVE-NEXT: add x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op) @@ -247,8 +441,32 @@ define i8 @smaxv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a) ret i8 %res @@ -265,8 +483,55 @@ define i8 @smaxv_v16i8(<16 x i8> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a) ret i8 %res @@ -285,9 +550,103 @@ define i8 @smaxv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: smaxv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smax v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: smaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #19] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #21] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op) @@ -305,8 +664,20 @@ define i16 @smaxv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a) ret i16 %res @@ -323,8 +694,31 @@ define i16 @smaxv_v8i16(<8 x i16> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a) ret i16 %res @@ -343,9 +737,55 @@ define i16 @smaxv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: smaxv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smax v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: smaxv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op) @@ -363,8 +803,13 @@ define i32 @smaxv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w0, w9, w8, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a) ret i32 %res @@ -381,8 +826,17 @@ define i32 @smaxv_v4i32(<4 x i32> %a) { ; ; NONEON-NOSVE-LABEL: smaxv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smaxv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a) ret i32 %res @@ -401,9 +855,27 @@ define i32 @smaxv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: smaxv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: smaxv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w11, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, gt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, gt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldp w10, w12, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, gt +; NONEON-NOSVE-NEXT: ldp w11, w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w10, w11 +; NONEON-NOSVE-NEXT: csel w10, w10, w11, gt +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, gt +; NONEON-NOSVE-NEXT: cmp w12, w9 +; NONEON-NOSVE-NEXT: csel w9, w12, w9, gt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, gt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op) @@ -424,11 +896,9 @@ define i64 @smaxv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, gt ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a) ret i64 %res @@ -447,15 +917,17 @@ define i64 @smaxv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: smaxv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmgt d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x11, [sp], #32 +; NONEON-NOSVE-NEXT: cmp x8, x9 +; NONEON-NOSVE-NEXT: csel x8, x8, x9, gt +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: csel x9, x11, x10, gt +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, gt ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op) @@ -477,8 +949,32 @@ define i8 @sminv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a) ret i8 %res @@ -495,8 +991,55 @@ define i8 @sminv_v16i8(<16 x i8> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a) ret i8 %res @@ -515,9 +1058,103 @@ define i8 @sminv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: sminv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smin v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: sminv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #19] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #21] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsb w10, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op) @@ -535,8 +1172,20 @@ define i16 @sminv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a) ret i16 %res @@ -553,8 +1202,31 @@ define i16 @sminv_v8i16(<8 x i16> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a) ret i16 %res @@ -573,9 +1245,55 @@ define i16 @sminv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: sminv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smin v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: sminv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: ldrsh w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op) @@ -593,8 +1311,13 @@ define i32 @sminv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w0, w9, w8, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a) ret i32 %res @@ -611,8 +1334,17 @@ define i32 @sminv_v4i32(<4 x i32> %a) { ; ; NONEON-NOSVE-LABEL: sminv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sminv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a) ret i32 %res @@ -631,9 +1363,27 @@ define i32 @sminv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: sminv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: smin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: sminv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w11, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lt +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lt +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldp w10, w12, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lt +; NONEON-NOSVE-NEXT: ldp w11, w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w10, w11 +; NONEON-NOSVE-NEXT: csel w10, w10, w11, lt +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lt +; NONEON-NOSVE-NEXT: cmp w12, w9 +; NONEON-NOSVE-NEXT: csel w9, w12, w9, lt +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lt +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op) @@ -654,11 +1404,9 @@ define i64 @sminv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, lt ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a) ret i64 %res @@ -676,16 +1424,18 @@ define i64 @sminv_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: sminv_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmgt v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmgt d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x11, [sp], #32 +; NONEON-NOSVE-NEXT: cmp x8, x9 +; NONEON-NOSVE-NEXT: csel x8, x8, x9, lt +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: csel x9, x11, x10, lt +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, lt ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op) @@ -707,8 +1457,32 @@ define i8 @umaxv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a) ret i8 %res @@ -725,8 +1499,55 @@ define i8 @umaxv_v16i8(<16 x i8> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a) ret i8 %res @@ -745,9 +1566,103 @@ define i8 @umaxv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: umaxv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umax v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: umaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op) @@ -765,8 +1680,20 @@ define i16 @umaxv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a) ret i16 %res @@ -783,8 +1710,31 @@ define i16 @umaxv_v8i16(<8 x i16> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a) ret i16 %res @@ -803,9 +1753,55 @@ define i16 @umaxv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: umaxv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umax v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: umaxv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op) @@ -823,8 +1819,13 @@ define i32 @umaxv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w0, w9, w8, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a) ret i32 %res @@ -841,8 +1842,17 @@ define i32 @umaxv_v4i32(<4 x i32> %a) { ; ; NONEON-NOSVE-LABEL: umaxv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umaxv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a) ret i32 %res @@ -861,9 +1871,27 @@ define i32 @umaxv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: umaxv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umax v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: umaxv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w11, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, hi +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldp w10, w12, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: ldp w11, w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w10, w11 +; NONEON-NOSVE-NEXT: csel w10, w10, w11, hi +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi +; NONEON-NOSVE-NEXT: cmp w12, w9 +; NONEON-NOSVE-NEXT: csel w9, w12, w9, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, hi +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op) @@ -884,11 +1912,9 @@ define i64 @umaxv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, hi ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a) ret i64 %res @@ -907,15 +1933,17 @@ define i64 @umaxv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: umaxv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bit v0.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmhi d2, d0, d1 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x11, [sp], #32 +; NONEON-NOSVE-NEXT: cmp x8, x9 +; NONEON-NOSVE-NEXT: csel x8, x8, x9, hi +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: csel x9, x11, x10, hi +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, hi ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op) @@ -937,8 +1965,32 @@ define i8 @uminv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a) ret i8 %res @@ -955,8 +2007,55 @@ define i8 @uminv_v16i8(<16 x i8> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a) ret i8 %res @@ -975,9 +2074,103 @@ define i8 @uminv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: uminv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umin v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uminv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #2] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #3] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #5] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #7] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #9] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #11] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #15] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op) @@ -995,8 +2188,20 @@ define i16 @uminv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a) ret i16 %res @@ -1013,8 +2218,31 @@ define i16 @uminv_v8i16(<8 x i16> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a) ret i16 %res @@ -1033,9 +2261,55 @@ define i16 @uminv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: uminv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umin v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uminv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #14] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op) @@ -1053,8 +2327,13 @@ define i32 @uminv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w0, w9, w8, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a) ret i32 %res @@ -1071,8 +2350,17 @@ define i32 @uminv_v4i32(<4 x i32> %a) { ; ; NONEON-NOSVE-LABEL: uminv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: uminv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a) ret i32 %res @@ -1091,9 +2379,27 @@ define i32 @uminv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: uminv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: umin v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uminv s0, v0.4s -; NONEON-NOSVE-NEXT: fmov w0, s0 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w11, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w9, w11, w10, lo +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldp w10, w12, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: ldp w11, w9, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w10, w11 +; NONEON-NOSVE-NEXT: csel w10, w10, w11, lo +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo +; NONEON-NOSVE-NEXT: cmp w12, w9 +; NONEON-NOSVE-NEXT: csel w9, w12, w9, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w0, w8, w9, lo +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op) @@ -1114,11 +2420,9 @@ define i64 @uminv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, lo ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a) ret i64 %res @@ -1136,16 +2440,18 @@ define i64 @uminv_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: uminv_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmhi v2.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: cmhi d2, d1, d0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x11, [sp], #32 +; NONEON-NOSVE-NEXT: cmp x8, x9 +; NONEON-NOSVE-NEXT: csel x8, x8, x9, lo +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: csel x9, x11, x10, lo +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x0, x9, x8, lo ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 99f8aef9f2b22d..cb1fb20ec9d8d7 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -28,31 +28,31 @@ define <4 x i8> @srem_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: shl v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: smov w11, v1.h[0] -; NONEON-NOSVE-NEXT: smov w12, v0.h[0] -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w14, v1.h[2] -; NONEON-NOSVE-NEXT: smov w15, v0.h[2] -; NONEON-NOSVE-NEXT: smov w17, v1.h[3] -; NONEON-NOSVE-NEXT: smov w18, v0.h[3] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w12, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w15, [sp, #10] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: ldrsb w17, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w18, [sp, #8] +; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.h[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w10, w16, w14, w15 +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = srem <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -86,49 +86,51 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: smov w11, v1.b[0] -; NONEON-NOSVE-NEXT: smov w12, v0.b[0] -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w14, v1.b[2] -; NONEON-NOSVE-NEXT: smov w15, v0.b[2] -; NONEON-NOSVE-NEXT: smov w17, v1.b[3] -; NONEON-NOSVE-NEXT: smov w18, v0.b[3] -; NONEON-NOSVE-NEXT: smov w1, v1.b[4] -; NONEON-NOSVE-NEXT: smov w2, v0.b[4] -; NONEON-NOSVE-NEXT: smov w4, v1.b[5] -; NONEON-NOSVE-NEXT: smov w5, v0.b[5] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.b[7] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: smov w11, v0.b[6] -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: smov w14, v0.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[2], w8 -; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: mov v2.b[3], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: mov v2.b[4], w8 -; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.b[6], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = srem <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -182,108 +184,90 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #-80]! // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: smov w11, v1.b[0] -; NONEON-NOSVE-NEXT: smov w12, v0.b[0] -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w14, v1.b[2] -; NONEON-NOSVE-NEXT: smov w15, v0.b[2] -; NONEON-NOSVE-NEXT: smov w17, v1.b[3] -; NONEON-NOSVE-NEXT: smov w18, v0.b[3] -; NONEON-NOSVE-NEXT: smov w1, v1.b[4] -; NONEON-NOSVE-NEXT: smov w2, v0.b[4] -; NONEON-NOSVE-NEXT: smov w4, v1.b[5] -; NONEON-NOSVE-NEXT: smov w5, v0.b[5] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 -; NONEON-NOSVE-NEXT: smov w7, v1.b[6] -; NONEON-NOSVE-NEXT: smov w19, v0.b[6] -; NONEON-NOSVE-NEXT: smov w21, v1.b[7] -; NONEON-NOSVE-NEXT: smov w22, v0.b[7] -; NONEON-NOSVE-NEXT: smov w24, v1.b[8] -; NONEON-NOSVE-NEXT: smov w25, v0.b[8] -; NONEON-NOSVE-NEXT: smov w27, v1.b[9] -; NONEON-NOSVE-NEXT: smov w28, v0.b[9] -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.b[11] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: smov w11, v0.b[10] -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.b[10] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: smov w14, v0.b[11] -; NONEON-NOSVE-NEXT: smov w16, v1.b[12] -; NONEON-NOSVE-NEXT: mov v2.b[2], w8 -; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: smov w17, v0.b[12] -; NONEON-NOSVE-NEXT: smov w0, v1.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[3], w8 -; NONEON-NOSVE-NEXT: sdiv w6, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: smov w1, v0.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[4], w8 -; NONEON-NOSVE-NEXT: sdiv w20, w19, w7 -; NONEON-NOSVE-NEXT: msub w8, w6, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: sdiv w23, w22, w21 -; NONEON-NOSVE-NEXT: msub w8, w20, w7, w19 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[6], w8 -; NONEON-NOSVE-NEXT: sdiv w26, w25, w24 -; NONEON-NOSVE-NEXT: msub w8, w23, w21, w22 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w28, w27 -; NONEON-NOSVE-NEXT: msub w8, w26, w24, w25 -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[8], w8 -; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w27, w28 -; NONEON-NOSVE-NEXT: mov v2.b[9], w8 -; NONEON-NOSVE-NEXT: sdiv w15, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: smov w10, v1.b[14] -; NONEON-NOSVE-NEXT: smov w11, v0.b[14] -; NONEON-NOSVE-NEXT: mov v2.b[10], w8 -; NONEON-NOSVE-NEXT: sdiv w18, w17, w16 -; NONEON-NOSVE-NEXT: msub w8, w15, w13, w14 -; NONEON-NOSVE-NEXT: smov w13, v1.b[15] -; NONEON-NOSVE-NEXT: smov w14, v0.b[15] -; NONEON-NOSVE-NEXT: mov v2.b[11], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w1, w0 -; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 -; NONEON-NOSVE-NEXT: mov v2.b[12], w8 -; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w0, w1 -; NONEON-NOSVE-NEXT: mov v2.b[13], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.b[14], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.b[15], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp], #80 // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = srem <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -375,275 +359,175 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: srem_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #320 -; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #224] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #240] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #256] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #272] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #288] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #304] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 320 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 -; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: str x0, [sp, #216] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w8, v1.b[1] -; NONEON-NOSVE-NEXT: smov w9, v0.b[1] -; NONEON-NOSVE-NEXT: smov w4, v3.b[1] -; NONEON-NOSVE-NEXT: smov w1, v2.b[1] -; NONEON-NOSVE-NEXT: smov w7, v3.b[7] -; NONEON-NOSVE-NEXT: smov w5, v2.b[7] -; NONEON-NOSVE-NEXT: smov w6, v3.b[8] -; NONEON-NOSVE-NEXT: smov w3, v2.b[8] -; NONEON-NOSVE-NEXT: smov w22, v3.b[9] -; NONEON-NOSVE-NEXT: smov w20, v2.b[9] -; NONEON-NOSVE-NEXT: smov w13, v3.b[0] -; NONEON-NOSVE-NEXT: smov w17, v3.b[3] -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #100] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w8, v1.b[0] -; NONEON-NOSVE-NEXT: str w9, [sp, #108] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w9, v0.b[0] -; NONEON-NOSVE-NEXT: smov w14, v2.b[3] -; NONEON-NOSVE-NEXT: smov w15, v3.b[4] -; NONEON-NOSVE-NEXT: smov w12, v2.b[4] -; NONEON-NOSVE-NEXT: smov w2, v3.b[5] -; NONEON-NOSVE-NEXT: smov w18, v2.b[5] -; NONEON-NOSVE-NEXT: smov w0, v3.b[6] -; NONEON-NOSVE-NEXT: smov w16, v2.b[6] -; NONEON-NOSVE-NEXT: smov w21, v3.b[10] -; NONEON-NOSVE-NEXT: smov w19, v2.b[10] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: ldr w30, [sp, #36] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: str w10, [sp, #116] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[2] -; NONEON-NOSVE-NEXT: smov w9, v0.b[2] -; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #44] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[3] -; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #52] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w9, v0.b[3] -; NONEON-NOSVE-NEXT: sdiv w26, w14, w17 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w11, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[4] -; NONEON-NOSVE-NEXT: smov w9, v0.b[4] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #60] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[5] -; NONEON-NOSVE-NEXT: smov w9, v0.b[5] -; NONEON-NOSVE-NEXT: str w8, [sp, #96] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #104] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #68] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[6] -; NONEON-NOSVE-NEXT: smov w9, v0.b[6] -; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #80] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #112] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[7] -; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #88] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w9, v0.b[7] -; NONEON-NOSVE-NEXT: sdiv w25, w12, w15 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #132] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[8] -; NONEON-NOSVE-NEXT: smov w9, v0.b[8] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #140] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[9] -; NONEON-NOSVE-NEXT: smov w9, v0.b[9] -; NONEON-NOSVE-NEXT: str w8, [sp, #148] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #156] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w11, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[10] -; NONEON-NOSVE-NEXT: smov w9, v0.b[10] -; NONEON-NOSVE-NEXT: str w10, [sp, #128] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #204] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[11] -; NONEON-NOSVE-NEXT: smov w9, v0.b[11] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #212] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[12] -; NONEON-NOSVE-NEXT: smov w9, v0.b[12] -; NONEON-NOSVE-NEXT: str w8, [sp, #172] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #180] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #200] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[13] -; NONEON-NOSVE-NEXT: smov w9, v0.b[13] -; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #164] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w11, v3.b[2] -; NONEON-NOSVE-NEXT: str w9, [sp, #176] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #188] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.b[14] -; NONEON-NOSVE-NEXT: smov w9, v0.b[14] -; NONEON-NOSVE-NEXT: str w8, [sp, #144] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #152] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #184] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w9, v2.b[2] -; NONEON-NOSVE-NEXT: sdiv w8, w1, w4 -; NONEON-NOSVE-NEXT: str w10, [sp, #160] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w10, v2.b[0] -; NONEON-NOSVE-NEXT: str w8, [sp, #24] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w8, w5, w7 -; NONEON-NOSVE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w8, w3, w6 -; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w8, w20, w22 -; NONEON-NOSVE-NEXT: sdiv w24, w10, w13 -; NONEON-NOSVE-NEXT: str w8, [sp, #32] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: ldp w29, w8, [sp, #40] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w30, w29 -; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #224] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s4, w8 -; NONEON-NOSVE-NEXT: sdiv w23, w9, w11 -; NONEON-NOSVE-NEXT: msub w10, w24, w13, w10 -; NONEON-NOSVE-NEXT: ldr w13, [sp, #24] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w24, [sp, #100] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w13, w13, w4, w1 -; NONEON-NOSVE-NEXT: ldr w1, [sp, #116] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w4, [sp, #108] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s5, w10 -; NONEON-NOSVE-NEXT: msub w1, w1, w24, w4 -; NONEON-NOSVE-NEXT: mov v5.b[1], w13 -; NONEON-NOSVE-NEXT: mov v4.b[1], w1 -; NONEON-NOSVE-NEXT: ldr w1, [sp, #120] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w23, w11, w9 -; NONEON-NOSVE-NEXT: ldr w11, [sp, #48] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w28, w18, w2 -; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #52] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #272] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w26, w17, w14 -; NONEON-NOSVE-NEXT: ldr w14, [sp, #72] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w11, w10 -; NONEON-NOSVE-NEXT: ldr w17, [sp, #96] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: smov w10, v3.b[11] -; NONEON-NOSVE-NEXT: smov w11, v2.b[11] -; NONEON-NOSVE-NEXT: mov v4.b[2], w9 -; NONEON-NOSVE-NEXT: mov v5.b[3], w8 -; NONEON-NOSVE-NEXT: msub w8, w25, w15, w12 -; NONEON-NOSVE-NEXT: ldp w13, w9, [sp, #76] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w27, w16, w0 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #104] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #256] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w14, w13 -; NONEON-NOSVE-NEXT: ldr w14, [sp, #60] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[4], w8 -; NONEON-NOSVE-NEXT: msub w8, w28, w2, w18 -; NONEON-NOSVE-NEXT: ldr w2, [sp, #156] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[3], w9 -; NONEON-NOSVE-NEXT: ldp w12, w9, [sp, #64] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[5], w8 -; NONEON-NOSVE-NEXT: msub w8, w27, w0, w16 -; NONEON-NOSVE-NEXT: ldr w0, [sp, #132] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w4, w19, w21 -; NONEON-NOSVE-NEXT: msub w9, w9, w14, w12 -; NONEON-NOSVE-NEXT: smov w12, v3.b[12] -; NONEON-NOSVE-NEXT: smov w14, v2.b[12] -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #240] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[6], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[4], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w7, w5 -; NONEON-NOSVE-NEXT: ldr w5, [sp, #204] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w17, w15 -; NONEON-NOSVE-NEXT: ldr w17, [sp, #84] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[7], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w13, w11, w10 -; NONEON-NOSVE-NEXT: mov v4.b[5], w9 -; NONEON-NOSVE-NEXT: ldp w16, w9, [sp, #88] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w6, w3 -; NONEON-NOSVE-NEXT: ldr w3, [sp, #148] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w17, w16 -; NONEON-NOSVE-NEXT: smov w16, v3.b[13] -; NONEON-NOSVE-NEXT: smov w17, v2.b[13] -; NONEON-NOSVE-NEXT: mov v5.b[8], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[6], w9 -; NONEON-NOSVE-NEXT: msub w8, w8, w22, w20 -; NONEON-NOSVE-NEXT: sdiv w15, w14, w12 -; NONEON-NOSVE-NEXT: ldp w18, w9, [sp, #136] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[9], w8 -; NONEON-NOSVE-NEXT: msub w8, w4, w21, w19 -; NONEON-NOSVE-NEXT: msub w9, w9, w0, w18 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #304] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #288] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[7], w9 -; NONEON-NOSVE-NEXT: mov v5.b[10], w8 -; NONEON-NOSVE-NEXT: msub w8, w13, w10, w11 -; NONEON-NOSVE-NEXT: ldp w0, w9, [sp, #124] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #196] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w13, [sp, #192] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w18, w17, w16 -; NONEON-NOSVE-NEXT: msub w9, w9, w1, w0 -; NONEON-NOSVE-NEXT: mov v5.b[11], w8 -; NONEON-NOSVE-NEXT: smov w0, v3.b[14] -; NONEON-NOSVE-NEXT: msub w10, w10, w13, w11 -; NONEON-NOSVE-NEXT: smov w1, v2.b[14] -; NONEON-NOSVE-NEXT: msub w8, w15, w12, w14 -; NONEON-NOSVE-NEXT: mov v4.b[8], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #164] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w15, w13, [sp, #168] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w3, w2 -; NONEON-NOSVE-NEXT: mov v5.b[12], w8 -; NONEON-NOSVE-NEXT: ldp w4, w3, [sp, #208] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w14, w12, [sp, #176] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[9], w9 -; NONEON-NOSVE-NEXT: sdiv w2, w1, w0 -; NONEON-NOSVE-NEXT: smov w9, v3.b[15] -; NONEON-NOSVE-NEXT: msub w3, w3, w5, w4 -; NONEON-NOSVE-NEXT: smov w4, v2.b[15] -; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 -; NONEON-NOSVE-NEXT: ldr w16, [sp, #144] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[10], w3 -; NONEON-NOSVE-NEXT: mov v5.b[13], w8 -; NONEON-NOSVE-NEXT: mov v4.b[11], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #188] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w11, w4, w9 -; NONEON-NOSVE-NEXT: msub w8, w2, w0, w1 -; NONEON-NOSVE-NEXT: msub w10, w10, w13, w12 -; NONEON-NOSVE-NEXT: smov w12, v1.b[15] -; NONEON-NOSVE-NEXT: smov w13, v0.b[15] -; NONEON-NOSVE-NEXT: mov v5.b[14], w8 -; NONEON-NOSVE-NEXT: mov v4.b[12], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #184] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w10, w10, w15, w14 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #152] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w14, w13, w12 -; NONEON-NOSVE-NEXT: msub w8, w11, w9, w4 -; NONEON-NOSVE-NEXT: mov v4.b[13], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #160] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[15], w8 -; NONEON-NOSVE-NEXT: ldr x8, [sp, #216] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w10, w10, w16, w15 -; NONEON-NOSVE-NEXT: mov v4.b[14], w10 -; NONEON-NOSVE-NEXT: msub w9, w14, w12, w13 -; NONEON-NOSVE-NEXT: mov v4.b[15], w9 -; NONEON-NOSVE-NEXT: stp q5, q4, [x8] -; NONEON-NOSVE-NEXT: add sp, sp, #320 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #47] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #62] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #61] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #59] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #58] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #57] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #55] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #54] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #53] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #52] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #51] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #50] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #49] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -669,29 +553,31 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: smov w11, v1.h[0] -; NONEON-NOSVE-NEXT: smov w12, v0.h[0] -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w14, v1.h[2] -; NONEON-NOSVE-NEXT: smov w15, v0.h[2] -; NONEON-NOSVE-NEXT: smov w17, v1.h[3] -; NONEON-NOSVE-NEXT: smov w18, v0.h[3] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.h[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = srem <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -724,47 +610,50 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: smov w11, v1.h[0] -; NONEON-NOSVE-NEXT: smov w12, v0.h[0] -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w14, v1.h[2] -; NONEON-NOSVE-NEXT: smov w15, v0.h[2] -; NONEON-NOSVE-NEXT: smov w17, v1.h[3] -; NONEON-NOSVE-NEXT: smov w18, v0.h[3] -; NONEON-NOSVE-NEXT: smov w1, v1.h[4] -; NONEON-NOSVE-NEXT: smov w2, v0.h[4] -; NONEON-NOSVE-NEXT: smov w4, v1.h[5] -; NONEON-NOSVE-NEXT: smov w5, v0.h[5] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: smov w13, v1.h[7] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: smov w11, v0.h[6] -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: smov w10, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: sdiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: smov w14, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: sdiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: sdiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = srem <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -813,135 +702,95 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: srem_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #144 -; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #80] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 -; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: smov w8, v1.h[1] -; NONEON-NOSVE-NEXT: smov w9, v0.h[1] -; NONEON-NOSVE-NEXT: smov w20, v1.h[0] -; NONEON-NOSVE-NEXT: smov w21, v0.h[0] -; NONEON-NOSVE-NEXT: smov w19, v0.h[3] -; NONEON-NOSVE-NEXT: smov w5, v1.h[4] -; NONEON-NOSVE-NEXT: smov w2, v0.h[4] -; NONEON-NOSVE-NEXT: smov w1, v3.h[1] -; NONEON-NOSVE-NEXT: smov w23, v2.h[1] -; NONEON-NOSVE-NEXT: smov w25, v3.h[0] -; NONEON-NOSVE-NEXT: smov w26, v2.h[0] -; NONEON-NOSVE-NEXT: smov w6, v1.h[5] -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w8, v1.h[2] -; NONEON-NOSVE-NEXT: smov w9, v0.h[2] -; NONEON-NOSVE-NEXT: smov w3, v0.h[5] -; NONEON-NOSVE-NEXT: smov w4, v1.h[6] -; NONEON-NOSVE-NEXT: smov w7, v0.h[6] -; NONEON-NOSVE-NEXT: smov w28, v3.h[2] -; NONEON-NOSVE-NEXT: smov w29, v2.h[2] -; NONEON-NOSVE-NEXT: smov w15, v3.h[3] -; NONEON-NOSVE-NEXT: smov w13, v2.h[3] -; NONEON-NOSVE-NEXT: smov w12, v3.h[4] -; NONEON-NOSVE-NEXT: smov w14, v3.h[5] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w11, w21, w20 -; NONEON-NOSVE-NEXT: str w10, [sp, #44] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: smov w8, v1.h[3] -; NONEON-NOSVE-NEXT: stp w8, w11, [sp] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w11, v2.h[4] -; NONEON-NOSVE-NEXT: ldr w22, [sp, #4] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w20, w22, w20, w21 -; NONEON-NOSVE-NEXT: sdiv w9, w19, w8 -; NONEON-NOSVE-NEXT: str w10, [sp, #32] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w10, v3.h[6] -; NONEON-NOSVE-NEXT: fmov s5, w20 -; NONEON-NOSVE-NEXT: smov w20, v3.h[7] -; NONEON-NOSVE-NEXT: sdiv w8, w2, w5 -; NONEON-NOSVE-NEXT: sdiv w24, w23, w1 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: sdiv w27, w26, w25 -; NONEON-NOSVE-NEXT: msub w1, w24, w1, w23 -; NONEON-NOSVE-NEXT: ldp w24, w23, [sp, #40] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w9, w3, w6 -; NONEON-NOSVE-NEXT: msub w21, w27, w25, w26 -; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w23, w23, w25, w24 -; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s4, w21 -; NONEON-NOSVE-NEXT: mov v5.h[1], w23 -; NONEON-NOSVE-NEXT: ldp w23, w21, [sp, #28] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[1], w1 -; NONEON-NOSVE-NEXT: sdiv w8, w7, w4 -; NONEON-NOSVE-NEXT: msub w21, w21, w25, w23 -; NONEON-NOSVE-NEXT: smov w23, v2.h[7] -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #80] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.h[2], w21 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: sdiv w30, w29, w28 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: smov w9, v2.h[5] -; NONEON-NOSVE-NEXT: smov w8, v2.h[6] -; NONEON-NOSVE-NEXT: sdiv w18, w13, w15 -; NONEON-NOSVE-NEXT: msub w1, w30, w28, w29 -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[2], w1 -; NONEON-NOSVE-NEXT: sdiv w16, w11, w12 -; NONEON-NOSVE-NEXT: msub w13, w18, w15, w13 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #20] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w18, [sp] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w15, w15, w18, w19 -; NONEON-NOSVE-NEXT: mov v4.h[3], w13 -; NONEON-NOSVE-NEXT: smov w13, v1.h[7] -; NONEON-NOSVE-NEXT: mov v5.h[3], w15 -; NONEON-NOSVE-NEXT: smov w15, v0.h[7] -; NONEON-NOSVE-NEXT: sdiv w17, w9, w14 -; NONEON-NOSVE-NEXT: msub w11, w16, w12, w11 -; NONEON-NOSVE-NEXT: ldr w12, [sp, #16] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w12, w12, w5, w2 -; NONEON-NOSVE-NEXT: mov v4.h[4], w11 -; NONEON-NOSVE-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.h[4], w12 -; NONEON-NOSVE-NEXT: msub w11, w11, w6, w3 -; NONEON-NOSVE-NEXT: sdiv w24, w8, w10 -; NONEON-NOSVE-NEXT: msub w9, w17, w14, w9 -; NONEON-NOSVE-NEXT: mov v5.h[5], w11 -; NONEON-NOSVE-NEXT: mov v4.h[5], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w4, w7 -; NONEON-NOSVE-NEXT: sdiv w18, w23, w20 -; NONEON-NOSVE-NEXT: msub w8, w24, w10, w8 -; NONEON-NOSVE-NEXT: mov v5.h[6], w9 -; NONEON-NOSVE-NEXT: mov v4.h[6], w8 -; NONEON-NOSVE-NEXT: sdiv w12, w15, w13 -; NONEON-NOSVE-NEXT: msub w8, w18, w20, w23 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[7], w8 -; NONEON-NOSVE-NEXT: msub w9, w12, w13, w15 -; NONEON-NOSVE-NEXT: mov v5.h[7], w9 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #144 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -964,19 +813,20 @@ define <2 x i32> @srem_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w11, v1.s[1] -; NONEON-NOSVE-NEXT: mov w12, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = srem <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -996,26 +846,28 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov w11, s1 -; NONEON-NOSVE-NEXT: fmov w12, s0 -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: mov w14, v1.s[2] -; NONEON-NOSVE-NEXT: mov w15, v0.s[2] -; NONEON-NOSVE-NEXT: mov w17, v1.s[3] -; NONEON-NOSVE-NEXT: mov w18, v0.s[3] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] ; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: sdiv w9, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.s[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = srem <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -1039,61 +891,50 @@ define void @srem_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: srem_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str x23, [sp, #-48]! // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -48 -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov w12, s0 -; NONEON-NOSVE-NEXT: fmov w3, s2 -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w11, s1 -; NONEON-NOSVE-NEXT: fmov w2, s3 -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w17, v3.s[1] -; NONEON-NOSVE-NEXT: mov w18, v2.s[1] -; NONEON-NOSVE-NEXT: mov w14, v1.s[2] -; NONEON-NOSVE-NEXT: mov w15, v0.s[2] -; NONEON-NOSVE-NEXT: mov w5, v3.s[2] -; NONEON-NOSVE-NEXT: mov w6, v2.s[2] -; NONEON-NOSVE-NEXT: sdiv w13, w12, w11 -; NONEON-NOSVE-NEXT: mov w19, v3.s[3] -; NONEON-NOSVE-NEXT: mov w20, v2.s[3] -; NONEON-NOSVE-NEXT: mov w22, v1.s[3] -; NONEON-NOSVE-NEXT: mov w23, v0.s[3] -; NONEON-NOSVE-NEXT: sdiv w4, w3, w2 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s1, w11 -; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w12, w4, w2, w3 -; NONEON-NOSVE-NEXT: fmov s0, w12 -; NONEON-NOSVE-NEXT: sdiv w1, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v1.s[1], w8 -; NONEON-NOSVE-NEXT: sdiv w16, w15, w14 -; NONEON-NOSVE-NEXT: msub w13, w1, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.s[1], w13 -; NONEON-NOSVE-NEXT: sdiv w7, w6, w5 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v1.s[2], w8 -; NONEON-NOSVE-NEXT: sdiv w21, w20, w19 -; NONEON-NOSVE-NEXT: msub w10, w7, w5, w6 -; NONEON-NOSVE-NEXT: mov v0.s[2], w10 -; NONEON-NOSVE-NEXT: sdiv w9, w23, w22 -; NONEON-NOSVE-NEXT: msub w10, w21, w19, w20 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v0.s[3], w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w22, w23 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #36] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #4] +; NONEON-NOSVE-NEXT: sdiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: sdiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -1116,13 +957,15 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d1 ; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = srem <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -1142,16 +985,19 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: srem_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x11, v1.d[1] -; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x11, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: sdiv x10, x11, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 -; NONEON-NOSVE-NEXT: sdiv x13, x12, x11 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = srem <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -1175,29 +1021,33 @@ define void @srem_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: srem_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: fmov x15, d2 -; NONEON-NOSVE-NEXT: mov x12, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x14, d3 -; NONEON-NOSVE-NEXT: mov x11, v3.d[1] -; NONEON-NOSVE-NEXT: mov x17, v1.d[1] -; NONEON-NOSVE-NEXT: mov x18, v0.d[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: sdiv x10, x11, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x11, x10, x8, x9 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: sdiv x10, x9, x8 -; NONEON-NOSVE-NEXT: sdiv x16, x15, x14 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: sdiv x13, x12, x11 -; NONEON-NOSVE-NEXT: msub x10, x16, x14, x15 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: sdiv x1, x18, x17 -; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 -; NONEON-NOSVE-NEXT: msub x11, x1, x17, x18 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -1229,37 +1079,31 @@ define <4 x i8> @urem_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w11, v1.h[0] -; NONEON-NOSVE-NEXT: umov w12, v0.h[0] -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w14, v1.h[2] -; NONEON-NOSVE-NEXT: umov w15, v0.h[2] -; NONEON-NOSVE-NEXT: umov w17, v1.h[3] -; NONEON-NOSVE-NEXT: umov w18, v0.h[3] -; NONEON-NOSVE-NEXT: and w11, w11, #0xff -; NONEON-NOSVE-NEXT: and w12, w12, #0xff -; NONEON-NOSVE-NEXT: and w8, w8, #0xff +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #10] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #8] ; NONEON-NOSVE-NEXT: udiv w13, w12, w11 -; NONEON-NOSVE-NEXT: and w9, w9, #0xff -; NONEON-NOSVE-NEXT: and w14, w14, #0xff -; NONEON-NOSVE-NEXT: and w15, w15, #0xff -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: and w12, w17, #0xff -; NONEON-NOSVE-NEXT: and w13, w18, #0xff -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w9, w13, w12 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.h[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w12, w13 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w0, w18, w17 +; NONEON-NOSVE-NEXT: msub w10, w16, w14, w15 +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = urem <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -1293,49 +1137,51 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w11, v1.b[0] -; NONEON-NOSVE-NEXT: umov w12, v0.b[0] -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w14, v1.b[2] -; NONEON-NOSVE-NEXT: umov w15, v0.b[2] -; NONEON-NOSVE-NEXT: umov w17, v1.b[3] -; NONEON-NOSVE-NEXT: umov w18, v0.b[3] -; NONEON-NOSVE-NEXT: umov w1, v1.b[4] -; NONEON-NOSVE-NEXT: umov w2, v0.b[4] -; NONEON-NOSVE-NEXT: umov w4, v1.b[5] -; NONEON-NOSVE-NEXT: umov w5, v0.b[5] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] ; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.b[7] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: umov w11, v0.b[6] -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[6] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: udiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: umov w14, v0.b[7] -; NONEON-NOSVE-NEXT: mov v2.b[2], w8 -; NONEON-NOSVE-NEXT: udiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: mov v2.b[3], w8 -; NONEON-NOSVE-NEXT: udiv w9, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: mov v2.b[4], w8 -; NONEON-NOSVE-NEXT: udiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: udiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.b[6], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: fmov d0, d2 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = urem <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -1389,108 +1235,90 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #-80]! // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: umov w11, v1.b[0] -; NONEON-NOSVE-NEXT: umov w12, v0.b[0] -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w14, v1.b[2] -; NONEON-NOSVE-NEXT: umov w15, v0.b[2] -; NONEON-NOSVE-NEXT: umov w17, v1.b[3] -; NONEON-NOSVE-NEXT: umov w18, v0.b[3] -; NONEON-NOSVE-NEXT: umov w1, v1.b[4] -; NONEON-NOSVE-NEXT: umov w2, v0.b[4] -; NONEON-NOSVE-NEXT: umov w4, v1.b[5] -; NONEON-NOSVE-NEXT: umov w5, v0.b[5] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 -; NONEON-NOSVE-NEXT: umov w7, v1.b[6] -; NONEON-NOSVE-NEXT: umov w19, v0.b[6] -; NONEON-NOSVE-NEXT: umov w21, v1.b[7] -; NONEON-NOSVE-NEXT: umov w22, v0.b[7] -; NONEON-NOSVE-NEXT: umov w24, v1.b[8] -; NONEON-NOSVE-NEXT: umov w25, v0.b[8] -; NONEON-NOSVE-NEXT: umov w27, v1.b[9] -; NONEON-NOSVE-NEXT: umov w28, v0.b[9] -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.b[11] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: umov w11, v0.b[10] -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.b[10] -; NONEON-NOSVE-NEXT: mov v2.b[1], w8 -; NONEON-NOSVE-NEXT: udiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: umov w14, v0.b[11] -; NONEON-NOSVE-NEXT: umov w16, v1.b[12] -; NONEON-NOSVE-NEXT: mov v2.b[2], w8 -; NONEON-NOSVE-NEXT: udiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: umov w17, v0.b[12] -; NONEON-NOSVE-NEXT: umov w0, v1.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[3], w8 -; NONEON-NOSVE-NEXT: udiv w6, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: umov w1, v0.b[13] -; NONEON-NOSVE-NEXT: mov v2.b[4], w8 -; NONEON-NOSVE-NEXT: udiv w20, w19, w7 -; NONEON-NOSVE-NEXT: msub w8, w6, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.b[5], w8 -; NONEON-NOSVE-NEXT: udiv w23, w22, w21 -; NONEON-NOSVE-NEXT: msub w8, w20, w7, w19 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[6], w8 -; NONEON-NOSVE-NEXT: udiv w26, w25, w24 -; NONEON-NOSVE-NEXT: msub w8, w23, w21, w22 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[7], w8 -; NONEON-NOSVE-NEXT: udiv w9, w28, w27 -; NONEON-NOSVE-NEXT: msub w8, w26, w24, w25 -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v2.b[8], w8 -; NONEON-NOSVE-NEXT: udiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w27, w28 -; NONEON-NOSVE-NEXT: mov v2.b[9], w8 -; NONEON-NOSVE-NEXT: udiv w15, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: umov w10, v1.b[14] -; NONEON-NOSVE-NEXT: umov w11, v0.b[14] -; NONEON-NOSVE-NEXT: mov v2.b[10], w8 -; NONEON-NOSVE-NEXT: udiv w18, w17, w16 -; NONEON-NOSVE-NEXT: msub w8, w15, w13, w14 -; NONEON-NOSVE-NEXT: umov w13, v1.b[15] -; NONEON-NOSVE-NEXT: umov w14, v0.b[15] -; NONEON-NOSVE-NEXT: mov v2.b[11], w8 -; NONEON-NOSVE-NEXT: udiv w9, w1, w0 -; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 -; NONEON-NOSVE-NEXT: mov v2.b[12], w8 -; NONEON-NOSVE-NEXT: udiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w0, w1 -; NONEON-NOSVE-NEXT: mov v2.b[13], w8 -; NONEON-NOSVE-NEXT: udiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.b[14], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.b[15], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp], #80 // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = urem <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -1582,275 +1410,175 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: urem_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #320 -; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #224] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #240] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #256] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #272] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #288] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #304] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 320 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 -; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: str x0, [sp, #216] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w8, v1.b[1] -; NONEON-NOSVE-NEXT: umov w9, v0.b[1] -; NONEON-NOSVE-NEXT: umov w4, v3.b[1] -; NONEON-NOSVE-NEXT: umov w1, v2.b[1] -; NONEON-NOSVE-NEXT: umov w7, v3.b[7] -; NONEON-NOSVE-NEXT: umov w5, v2.b[7] -; NONEON-NOSVE-NEXT: umov w6, v3.b[8] -; NONEON-NOSVE-NEXT: umov w3, v2.b[8] -; NONEON-NOSVE-NEXT: umov w22, v3.b[9] -; NONEON-NOSVE-NEXT: umov w20, v2.b[9] -; NONEON-NOSVE-NEXT: umov w13, v3.b[0] -; NONEON-NOSVE-NEXT: umov w17, v3.b[3] -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: str w8, [sp, #100] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w8, v1.b[0] -; NONEON-NOSVE-NEXT: str w9, [sp, #108] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w9, v0.b[0] -; NONEON-NOSVE-NEXT: umov w14, v2.b[3] -; NONEON-NOSVE-NEXT: umov w15, v3.b[4] -; NONEON-NOSVE-NEXT: umov w12, v2.b[4] -; NONEON-NOSVE-NEXT: umov w2, v3.b[5] -; NONEON-NOSVE-NEXT: umov w18, v2.b[5] -; NONEON-NOSVE-NEXT: umov w0, v3.b[6] -; NONEON-NOSVE-NEXT: umov w16, v2.b[6] -; NONEON-NOSVE-NEXT: umov w21, v3.b[10] -; NONEON-NOSVE-NEXT: umov w19, v2.b[10] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: ldr w30, [sp, #36] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: str w10, [sp, #116] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[2] -; NONEON-NOSVE-NEXT: umov w9, v0.b[2] -; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #44] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[3] -; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #52] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w9, v0.b[3] -; NONEON-NOSVE-NEXT: udiv w26, w14, w17 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w11, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[4] -; NONEON-NOSVE-NEXT: umov w9, v0.b[4] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #60] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[5] -; NONEON-NOSVE-NEXT: umov w9, v0.b[5] -; NONEON-NOSVE-NEXT: str w8, [sp, #96] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #104] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #68] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[6] -; NONEON-NOSVE-NEXT: umov w9, v0.b[6] -; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #80] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #112] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[7] -; NONEON-NOSVE-NEXT: stp w9, w10, [sp, #88] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w9, v0.b[7] -; NONEON-NOSVE-NEXT: udiv w25, w12, w15 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #132] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[8] -; NONEON-NOSVE-NEXT: umov w9, v0.b[8] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #140] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[9] -; NONEON-NOSVE-NEXT: umov w9, v0.b[9] -; NONEON-NOSVE-NEXT: str w8, [sp, #148] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #156] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w11, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[10] -; NONEON-NOSVE-NEXT: umov w9, v0.b[10] -; NONEON-NOSVE-NEXT: str w10, [sp, #128] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #204] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[11] -; NONEON-NOSVE-NEXT: umov w9, v0.b[11] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #192] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #212] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[12] -; NONEON-NOSVE-NEXT: umov w9, v0.b[12] -; NONEON-NOSVE-NEXT: str w8, [sp, #172] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #180] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #200] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[13] -; NONEON-NOSVE-NEXT: umov w9, v0.b[13] -; NONEON-NOSVE-NEXT: stp w11, w8, [sp, #164] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w11, v3.b[2] -; NONEON-NOSVE-NEXT: str w9, [sp, #176] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #188] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.b[14] -; NONEON-NOSVE-NEXT: umov w9, v0.b[14] -; NONEON-NOSVE-NEXT: str w8, [sp, #144] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w9, [sp, #152] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: str w10, [sp, #184] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w9, v2.b[2] -; NONEON-NOSVE-NEXT: udiv w8, w1, w4 -; NONEON-NOSVE-NEXT: str w10, [sp, #160] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w10, v2.b[0] -; NONEON-NOSVE-NEXT: str w8, [sp, #24] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w8, w5, w7 -; NONEON-NOSVE-NEXT: str w8, [sp, #28] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w8, w3, w6 -; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w8, w20, w22 -; NONEON-NOSVE-NEXT: udiv w24, w10, w13 -; NONEON-NOSVE-NEXT: str w8, [sp, #32] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: ldp w29, w8, [sp, #40] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w30, w29 -; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #224] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s4, w8 -; NONEON-NOSVE-NEXT: udiv w23, w9, w11 -; NONEON-NOSVE-NEXT: msub w10, w24, w13, w10 -; NONEON-NOSVE-NEXT: ldr w13, [sp, #24] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w24, [sp, #100] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w13, w13, w4, w1 -; NONEON-NOSVE-NEXT: ldr w1, [sp, #116] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w4, [sp, #108] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s5, w10 -; NONEON-NOSVE-NEXT: msub w1, w1, w24, w4 -; NONEON-NOSVE-NEXT: mov v5.b[1], w13 -; NONEON-NOSVE-NEXT: mov v4.b[1], w1 -; NONEON-NOSVE-NEXT: ldr w1, [sp, #120] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w23, w11, w9 -; NONEON-NOSVE-NEXT: ldr w11, [sp, #48] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w28, w18, w2 -; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #52] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #272] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w26, w17, w14 -; NONEON-NOSVE-NEXT: ldr w14, [sp, #72] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w11, w10 -; NONEON-NOSVE-NEXT: ldr w17, [sp, #96] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: umov w10, v3.b[11] -; NONEON-NOSVE-NEXT: umov w11, v2.b[11] -; NONEON-NOSVE-NEXT: mov v4.b[2], w9 -; NONEON-NOSVE-NEXT: mov v5.b[3], w8 -; NONEON-NOSVE-NEXT: msub w8, w25, w15, w12 -; NONEON-NOSVE-NEXT: ldp w13, w9, [sp, #76] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w27, w16, w0 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #104] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #256] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w14, w13 -; NONEON-NOSVE-NEXT: ldr w14, [sp, #60] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[4], w8 -; NONEON-NOSVE-NEXT: msub w8, w28, w2, w18 -; NONEON-NOSVE-NEXT: ldr w2, [sp, #156] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[3], w9 -; NONEON-NOSVE-NEXT: ldp w12, w9, [sp, #64] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[5], w8 -; NONEON-NOSVE-NEXT: msub w8, w27, w0, w16 -; NONEON-NOSVE-NEXT: ldr w0, [sp, #132] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w4, w19, w21 -; NONEON-NOSVE-NEXT: msub w9, w9, w14, w12 -; NONEON-NOSVE-NEXT: umov w12, v3.b[12] -; NONEON-NOSVE-NEXT: umov w14, v2.b[12] -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #240] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[6], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[4], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w7, w5 -; NONEON-NOSVE-NEXT: ldr w5, [sp, #204] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w17, w15 -; NONEON-NOSVE-NEXT: ldr w17, [sp, #84] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[7], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w13, w11, w10 -; NONEON-NOSVE-NEXT: mov v4.b[5], w9 -; NONEON-NOSVE-NEXT: ldp w16, w9, [sp, #88] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w8, w8, w6, w3 -; NONEON-NOSVE-NEXT: ldr w3, [sp, #148] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w17, w16 -; NONEON-NOSVE-NEXT: umov w16, v3.b[13] -; NONEON-NOSVE-NEXT: umov w17, v2.b[13] -; NONEON-NOSVE-NEXT: mov v5.b[8], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[6], w9 -; NONEON-NOSVE-NEXT: msub w8, w8, w22, w20 -; NONEON-NOSVE-NEXT: udiv w15, w14, w12 -; NONEON-NOSVE-NEXT: ldp w18, w9, [sp, #136] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[9], w8 -; NONEON-NOSVE-NEXT: msub w8, w4, w21, w19 -; NONEON-NOSVE-NEXT: msub w9, w9, w0, w18 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #304] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #288] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[7], w9 -; NONEON-NOSVE-NEXT: mov v5.b[10], w8 -; NONEON-NOSVE-NEXT: msub w8, w13, w10, w11 -; NONEON-NOSVE-NEXT: ldp w0, w9, [sp, #124] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w11, w10, [sp, #196] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w13, [sp, #192] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w18, w17, w16 -; NONEON-NOSVE-NEXT: msub w9, w9, w1, w0 -; NONEON-NOSVE-NEXT: mov v5.b[11], w8 -; NONEON-NOSVE-NEXT: umov w0, v3.b[14] -; NONEON-NOSVE-NEXT: msub w10, w10, w13, w11 -; NONEON-NOSVE-NEXT: umov w1, v2.b[14] -; NONEON-NOSVE-NEXT: msub w8, w15, w12, w14 -; NONEON-NOSVE-NEXT: mov v4.b[8], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #164] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w15, w13, [sp, #168] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w3, w2 -; NONEON-NOSVE-NEXT: mov v5.b[12], w8 -; NONEON-NOSVE-NEXT: ldp w4, w3, [sp, #208] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp w14, w12, [sp, #176] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[9], w9 -; NONEON-NOSVE-NEXT: udiv w2, w1, w0 -; NONEON-NOSVE-NEXT: umov w9, v3.b[15] -; NONEON-NOSVE-NEXT: msub w3, w3, w5, w4 -; NONEON-NOSVE-NEXT: umov w4, v2.b[15] -; NONEON-NOSVE-NEXT: msub w8, w18, w16, w17 -; NONEON-NOSVE-NEXT: ldr w16, [sp, #144] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.b[10], w3 -; NONEON-NOSVE-NEXT: mov v5.b[13], w8 -; NONEON-NOSVE-NEXT: mov v4.b[11], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #188] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w11, w4, w9 -; NONEON-NOSVE-NEXT: msub w8, w2, w0, w1 -; NONEON-NOSVE-NEXT: msub w10, w10, w13, w12 -; NONEON-NOSVE-NEXT: umov w12, v1.b[15] -; NONEON-NOSVE-NEXT: umov w13, v0.b[15] -; NONEON-NOSVE-NEXT: mov v5.b[14], w8 -; NONEON-NOSVE-NEXT: mov v4.b[12], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #184] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w10, w10, w15, w14 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #152] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w14, w13, w12 -; NONEON-NOSVE-NEXT: msub w8, w11, w9, w4 -; NONEON-NOSVE-NEXT: mov v4.b[13], w10 -; NONEON-NOSVE-NEXT: ldr w10, [sp, #160] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.b[15], w8 -; NONEON-NOSVE-NEXT: ldr x8, [sp, #216] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w10, w10, w16, w15 -; NONEON-NOSVE-NEXT: mov v4.b[14], w10 -; NONEON-NOSVE-NEXT: msub w9, w14, w12, w13 -; NONEON-NOSVE-NEXT: mov v4.b[15], w9 -; NONEON-NOSVE-NEXT: stp q5, q4, [x8] -; NONEON-NOSVE-NEXT: add sp, sp, #320 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -1876,29 +1604,31 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w11, v1.h[0] -; NONEON-NOSVE-NEXT: umov w12, v0.h[0] -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w14, v1.h[2] -; NONEON-NOSVE-NEXT: umov w15, v0.h[2] -; NONEON-NOSVE-NEXT: umov w17, v1.h[3] -; NONEON-NOSVE-NEXT: umov w18, v0.h[3] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] ; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w9, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.h[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.h[3], w8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = urem <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -1931,47 +1661,50 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: umov w11, v1.h[0] -; NONEON-NOSVE-NEXT: umov w12, v0.h[0] -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w14, v1.h[2] -; NONEON-NOSVE-NEXT: umov w15, v0.h[2] -; NONEON-NOSVE-NEXT: umov w17, v1.h[3] -; NONEON-NOSVE-NEXT: umov w18, v0.h[3] -; NONEON-NOSVE-NEXT: umov w1, v1.h[4] -; NONEON-NOSVE-NEXT: umov w2, v0.h[4] -; NONEON-NOSVE-NEXT: umov w4, v1.h[5] -; NONEON-NOSVE-NEXT: umov w5, v0.h[5] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] ; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: umov w13, v1.h[7] -; NONEON-NOSVE-NEXT: fmov s2, w11 -; NONEON-NOSVE-NEXT: umov w11, v0.h[6] -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: umov w10, v1.h[6] -; NONEON-NOSVE-NEXT: mov v2.h[1], w8 -; NONEON-NOSVE-NEXT: udiv w0, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: umov w14, v0.h[7] -; NONEON-NOSVE-NEXT: mov v2.h[2], w8 -; NONEON-NOSVE-NEXT: udiv w3, w2, w1 -; NONEON-NOSVE-NEXT: msub w8, w0, w17, w18 -; NONEON-NOSVE-NEXT: mov v2.h[3], w8 -; NONEON-NOSVE-NEXT: udiv w9, w5, w4 -; NONEON-NOSVE-NEXT: msub w8, w3, w1, w2 -; NONEON-NOSVE-NEXT: mov v2.h[4], w8 -; NONEON-NOSVE-NEXT: udiv w12, w11, w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w4, w5 -; NONEON-NOSVE-NEXT: mov v2.h[5], w8 -; NONEON-NOSVE-NEXT: udiv w9, w14, w13 -; NONEON-NOSVE-NEXT: msub w8, w12, w10, w11 -; NONEON-NOSVE-NEXT: mov v2.h[6], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w13, w14 -; NONEON-NOSVE-NEXT: mov v2.h[7], w8 -; NONEON-NOSVE-NEXT: mov v0.16b, v2.16b +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = urem <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -2020,135 +1753,95 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: urem_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #144 -; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #64] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #80] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #96] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #112] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #128] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 -; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 -; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 -; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 -; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 -; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 -; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 -; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ldr q2, [x0] -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: umov w8, v1.h[1] -; NONEON-NOSVE-NEXT: umov w9, v0.h[1] -; NONEON-NOSVE-NEXT: umov w20, v1.h[0] -; NONEON-NOSVE-NEXT: umov w21, v0.h[0] -; NONEON-NOSVE-NEXT: umov w19, v0.h[3] -; NONEON-NOSVE-NEXT: umov w5, v1.h[4] -; NONEON-NOSVE-NEXT: umov w2, v0.h[4] -; NONEON-NOSVE-NEXT: umov w1, v3.h[1] -; NONEON-NOSVE-NEXT: umov w23, v2.h[1] -; NONEON-NOSVE-NEXT: umov w25, v3.h[0] -; NONEON-NOSVE-NEXT: umov w26, v2.h[0] -; NONEON-NOSVE-NEXT: umov w6, v1.h[5] -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #36] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w8, v1.h[2] -; NONEON-NOSVE-NEXT: umov w9, v0.h[2] -; NONEON-NOSVE-NEXT: umov w3, v0.h[5] -; NONEON-NOSVE-NEXT: umov w4, v1.h[6] -; NONEON-NOSVE-NEXT: umov w7, v0.h[6] -; NONEON-NOSVE-NEXT: umov w28, v3.h[2] -; NONEON-NOSVE-NEXT: umov w29, v2.h[2] -; NONEON-NOSVE-NEXT: umov w15, v3.h[3] -; NONEON-NOSVE-NEXT: umov w13, v2.h[3] -; NONEON-NOSVE-NEXT: umov w12, v3.h[4] -; NONEON-NOSVE-NEXT: umov w14, v3.h[5] -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w11, w21, w20 -; NONEON-NOSVE-NEXT: str w10, [sp, #44] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: umov w8, v1.h[3] -; NONEON-NOSVE-NEXT: stp w8, w11, [sp] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w11, v2.h[4] -; NONEON-NOSVE-NEXT: ldr w22, [sp, #4] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w20, w22, w20, w21 -; NONEON-NOSVE-NEXT: udiv w9, w19, w8 -; NONEON-NOSVE-NEXT: str w10, [sp, #32] // 4-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w10, v3.h[6] -; NONEON-NOSVE-NEXT: fmov s5, w20 -; NONEON-NOSVE-NEXT: umov w20, v3.h[7] -; NONEON-NOSVE-NEXT: udiv w8, w2, w5 -; NONEON-NOSVE-NEXT: udiv w24, w23, w1 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: udiv w27, w26, w25 -; NONEON-NOSVE-NEXT: msub w1, w24, w1, w23 -; NONEON-NOSVE-NEXT: ldp w24, w23, [sp, #40] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w9, w3, w6 -; NONEON-NOSVE-NEXT: msub w21, w27, w25, w26 -; NONEON-NOSVE-NEXT: ldr w25, [sp, #36] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w23, w23, w25, w24 -; NONEON-NOSVE-NEXT: ldr w25, [sp, #24] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: fmov s4, w21 -; NONEON-NOSVE-NEXT: mov v5.h[1], w23 -; NONEON-NOSVE-NEXT: ldp w23, w21, [sp, #28] // 8-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[1], w1 -; NONEON-NOSVE-NEXT: udiv w8, w7, w4 -; NONEON-NOSVE-NEXT: msub w21, w21, w25, w23 -; NONEON-NOSVE-NEXT: umov w23, v2.h[7] -; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #80] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.h[2], w21 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #112] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: udiv w30, w29, w28 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: umov w9, v2.h[5] -; NONEON-NOSVE-NEXT: umov w8, v2.h[6] -; NONEON-NOSVE-NEXT: udiv w18, w13, w15 -; NONEON-NOSVE-NEXT: msub w1, w30, w28, w29 -; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[2], w1 -; NONEON-NOSVE-NEXT: udiv w16, w11, w12 -; NONEON-NOSVE-NEXT: msub w13, w18, w15, w13 -; NONEON-NOSVE-NEXT: ldr w15, [sp, #20] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: ldr w18, [sp] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w15, w15, w18, w19 -; NONEON-NOSVE-NEXT: mov v4.h[3], w13 -; NONEON-NOSVE-NEXT: umov w13, v1.h[7] -; NONEON-NOSVE-NEXT: mov v5.h[3], w15 -; NONEON-NOSVE-NEXT: umov w15, v0.h[7] -; NONEON-NOSVE-NEXT: udiv w17, w9, w14 -; NONEON-NOSVE-NEXT: msub w11, w16, w12, w11 -; NONEON-NOSVE-NEXT: ldr w12, [sp, #16] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w12, w12, w5, w2 -; NONEON-NOSVE-NEXT: mov v4.h[4], w11 -; NONEON-NOSVE-NEXT: ldr w11, [sp, #12] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v5.h[4], w12 -; NONEON-NOSVE-NEXT: msub w11, w11, w6, w3 -; NONEON-NOSVE-NEXT: udiv w24, w8, w10 -; NONEON-NOSVE-NEXT: msub w9, w17, w14, w9 -; NONEON-NOSVE-NEXT: mov v5.h[5], w11 -; NONEON-NOSVE-NEXT: mov v4.h[5], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload -; NONEON-NOSVE-NEXT: msub w9, w9, w4, w7 -; NONEON-NOSVE-NEXT: udiv w18, w23, w20 -; NONEON-NOSVE-NEXT: msub w8, w24, w10, w8 -; NONEON-NOSVE-NEXT: mov v5.h[6], w9 -; NONEON-NOSVE-NEXT: mov v4.h[6], w8 -; NONEON-NOSVE-NEXT: udiv w12, w15, w13 -; NONEON-NOSVE-NEXT: msub w8, w18, w20, w23 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #128] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #96] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v4.h[7], w8 -; NONEON-NOSVE-NEXT: msub w9, w12, w13, w15 -; NONEON-NOSVE-NEXT: mov v5.h[7], w9 -; NONEON-NOSVE-NEXT: stp q4, q5, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #144 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -2171,19 +1864,20 @@ define <2 x i32> @urem_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: mov w11, v1.s[1] -; NONEON-NOSVE-NEXT: mov w12, v0.s[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: msub w9, w13, w11, w12 -; NONEON-NOSVE-NEXT: mov v0.s[1], w9 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = urem <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -2203,26 +1897,28 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov w11, s1 -; NONEON-NOSVE-NEXT: fmov w12, s0 -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: mov w14, v1.s[2] -; NONEON-NOSVE-NEXT: mov w15, v0.s[2] -; NONEON-NOSVE-NEXT: mov w17, v1.s[3] -; NONEON-NOSVE-NEXT: mov w18, v0.s[3] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: udiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] ; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s0, w11 -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 ; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v0.s[1], w8 -; NONEON-NOSVE-NEXT: udiv w9, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v0.s[2], w8 -; NONEON-NOSVE-NEXT: msub w8, w9, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.s[3], w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = urem <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -2246,61 +1942,50 @@ define void @urem_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: urem_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str x23, [sp, #-48]! // 8-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 -; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 -; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 -; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 -; NONEON-NOSVE-NEXT: .cfi_offset w23, -48 -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov w12, s0 -; NONEON-NOSVE-NEXT: fmov w3, s2 -; NONEON-NOSVE-NEXT: mov w9, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w11, s1 -; NONEON-NOSVE-NEXT: fmov w2, s3 -; NONEON-NOSVE-NEXT: mov w8, v1.s[1] -; NONEON-NOSVE-NEXT: mov w17, v3.s[1] -; NONEON-NOSVE-NEXT: mov w18, v2.s[1] -; NONEON-NOSVE-NEXT: mov w14, v1.s[2] -; NONEON-NOSVE-NEXT: mov w15, v0.s[2] -; NONEON-NOSVE-NEXT: mov w5, v3.s[2] -; NONEON-NOSVE-NEXT: mov w6, v2.s[2] -; NONEON-NOSVE-NEXT: udiv w13, w12, w11 -; NONEON-NOSVE-NEXT: mov w19, v3.s[3] -; NONEON-NOSVE-NEXT: mov w20, v2.s[3] -; NONEON-NOSVE-NEXT: mov w22, v1.s[3] -; NONEON-NOSVE-NEXT: mov w23, v0.s[3] -; NONEON-NOSVE-NEXT: udiv w4, w3, w2 -; NONEON-NOSVE-NEXT: msub w11, w13, w11, w12 -; NONEON-NOSVE-NEXT: fmov s1, w11 -; NONEON-NOSVE-NEXT: udiv w10, w9, w8 -; NONEON-NOSVE-NEXT: msub w12, w4, w2, w3 -; NONEON-NOSVE-NEXT: fmov s0, w12 -; NONEON-NOSVE-NEXT: udiv w1, w18, w17 -; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 -; NONEON-NOSVE-NEXT: mov v1.s[1], w8 -; NONEON-NOSVE-NEXT: udiv w16, w15, w14 -; NONEON-NOSVE-NEXT: msub w13, w1, w17, w18 -; NONEON-NOSVE-NEXT: mov v0.s[1], w13 -; NONEON-NOSVE-NEXT: udiv w7, w6, w5 -; NONEON-NOSVE-NEXT: msub w8, w16, w14, w15 -; NONEON-NOSVE-NEXT: mov v1.s[2], w8 -; NONEON-NOSVE-NEXT: udiv w21, w20, w19 -; NONEON-NOSVE-NEXT: msub w10, w7, w5, w6 -; NONEON-NOSVE-NEXT: mov v0.s[2], w10 -; NONEON-NOSVE-NEXT: udiv w9, w23, w22 -; NONEON-NOSVE-NEXT: msub w10, w21, w19, w20 -; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v0.s[3], w10 -; NONEON-NOSVE-NEXT: msub w8, w9, w22, w23 -; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: mov v1.s[3], w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: udiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #36] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #76] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w9, w11, [sp, #4] +; NONEON-NOSVE-NEXT: udiv w10, w11, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w11 +; NONEON-NOSVE-NEXT: str w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w11, w10, w8, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: udiv w10, w9, w8 +; NONEON-NOSVE-NEXT: msub w8, w10, w8, w9 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -2323,13 +2008,15 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d1 killed $d1 def $q1 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: fmov x8, d1 ; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: udiv x10, x9, x8 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = urem <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -2349,16 +2036,19 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: urem_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: mov x11, v1.d[1] -; NONEON-NOSVE-NEXT: mov x12, v0.d[1] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x11, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: udiv x10, x11, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv x10, x9, x8 -; NONEON-NOSVE-NEXT: udiv x13, x12, x11 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d0, x8 -; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = urem <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -2382,29 +2072,33 @@ define void @urem_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: urem_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q2, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q1, [x1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: fmov x15, d2 -; NONEON-NOSVE-NEXT: mov x12, v2.d[1] -; NONEON-NOSVE-NEXT: fmov x8, d1 -; NONEON-NOSVE-NEXT: fmov x14, d3 -; NONEON-NOSVE-NEXT: mov x11, v3.d[1] -; NONEON-NOSVE-NEXT: mov x17, v1.d[1] -; NONEON-NOSVE-NEXT: mov x18, v0.d[1] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: udiv x10, x11, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: udiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x11, x10, x8, x9 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: udiv x10, x9, x8 +; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] ; NONEON-NOSVE-NEXT: udiv x10, x9, x8 -; NONEON-NOSVE-NEXT: udiv x16, x15, x14 ; NONEON-NOSVE-NEXT: msub x8, x10, x8, x9 -; NONEON-NOSVE-NEXT: fmov d1, x8 -; NONEON-NOSVE-NEXT: udiv x13, x12, x11 -; NONEON-NOSVE-NEXT: msub x10, x16, x14, x15 -; NONEON-NOSVE-NEXT: fmov d0, x10 -; NONEON-NOSVE-NEXT: udiv x1, x18, x17 -; NONEON-NOSVE-NEXT: msub x9, x13, x11, x12 -; NONEON-NOSVE-NEXT: mov v0.d[1], x9 -; NONEON-NOSVE-NEXT: msub x11, x1, x17, x18 -; NONEON-NOSVE-NEXT: mov v1.d[1], x11 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll index 0108fb580b947b..5cee1360f6f3cf 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll @@ -20,10 +20,28 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4h, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i8> %op1, <4 x i8> %op2 ret <4 x i8> %sel @@ -43,10 +61,44 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v8i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.8b, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2 ret <8 x i8> %sel @@ -66,10 +118,75 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v16i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.16b, w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2 ret <16 x i8> %sel @@ -92,16 +209,147 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v32i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.16b, w8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #63] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #62] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #61] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #60] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #59] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #58] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #57] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #56] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #55] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #54] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #53] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #52] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #51] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #50] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #49] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <32 x i8>, ptr %a %op2 = load volatile <32 x i8>, ptr %b @@ -125,10 +373,18 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.2s, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w11, w10, w8, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i16> %op1, <2 x i16> %op2 ret <2 x i16> %sel @@ -149,10 +405,28 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4h, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2 ret <4 x i16> %sel @@ -173,10 +447,43 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v8i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.8h, w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2 ret <8 x i16> %sel @@ -200,16 +507,83 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.8h, w8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #62] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #60] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #58] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #50] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <16 x i16>, ptr %a %op2 = load volatile <16 x i16>, ptr %b @@ -233,10 +607,18 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.2s, w8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w11, w10, w8, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2 ret <2 x i32> %sel @@ -257,10 +639,23 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: dup v2.4s, w8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: csel w11, w10, w8, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w11, w10, w8, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, ne +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2 ret <4 x i32> %sel @@ -284,16 +679,43 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm w8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.4s, w8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: csel w11, w8, w10, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: csel w11, w8, w10, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: csel w11, w8, w10, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: csel w11, w8, w10, ne +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: csel w8, w8, w9, ne +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <8 x i32>, ptr %a %op2 = load volatile <8 x i32>, ptr %b @@ -318,10 +740,15 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: fmov d2, x8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: csel x8, x9, x8, ne +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2 ret <1 x i64> %sel @@ -343,10 +770,17 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: dup v2.2d, x8 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: csel x11, x10, x8, ne +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csel x8, x9, x8, ne +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2 ret <2 x i64> %sel @@ -371,16 +805,31 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: ldr q1, [x0, #16] ; NONEON-NOSVE-NEXT: tst w2, #0x1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: ldr q3, [x1] -; NONEON-NOSVE-NEXT: ldr q4, [x1, #16] -; NONEON-NOSVE-NEXT: dup v0.2d, x8 -; NONEON-NOSVE-NEXT: bif v1.16b, v3.16b, v0.16b -; NONEON-NOSVE-NEXT: bsl v0.16b, v2.16b, v4.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1] +; NONEON-NOSVE-NEXT: ldr q3, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: csel x11, x8, x10, ne +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csel x8, x8, x9, ne +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #48] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: csel x11, x8, x10, ne +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: csel x8, x8, x9, ne +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load volatile <4 x i64>, ptr %a %op2 = load volatile <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll index f7198e3042ad53..2778e93416a748 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll @@ -23,12 +23,27 @@ define <4 x i8> @ashr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h -; NONEON-NOSVE-NEXT: sshl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w11, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w13, [sp, #10] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsb w14, [sp, #8] +; NONEON-NOSVE-NEXT: asr w10, w11, w10 +; NONEON-NOSVE-NEXT: asr w11, w13, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: asr w8, w14, w9 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: strh w11, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -46,8 +61,43 @@ define <8 x i8> @ashr_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.8b, v1.8b -; NONEON-NOSVE-NEXT: sshl v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = ashr <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -65,8 +115,74 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b -; NONEON-NOSVE-NEXT: sshl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = ashr <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -86,13 +202,143 @@ define void @ashr_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ashr_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.16b, v0.16b -; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b -; NONEON-NOSVE-NEXT: sshl v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: sshl v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #47] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -115,12 +361,18 @@ define <2 x i16> @ashr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s -; NONEON-NOSVE-NEXT: sshl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w11, [sp, #8] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: asr w9, w11, w10 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -138,8 +390,27 @@ define <4 x i16> @ashr_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h -; NONEON-NOSVE-NEXT: sshl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -157,8 +428,42 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h -; NONEON-NOSVE-NEXT: sshl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = ashr <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -178,13 +483,79 @@ define void @ashr_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ashr_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.8h, v0.8h -; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h -; NONEON-NOSVE-NEXT: sshl v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: sshl v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -205,8 +576,17 @@ define <2 x i32> @ashr_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s -; NONEON-NOSVE-NEXT: sshl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -224,8 +604,22 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s -; NONEON-NOSVE-NEXT: sshl v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = ashr <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -245,13 +639,39 @@ define void @ashr_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ashr_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s -; NONEON-NOSVE-NEXT: sshl v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: sshl v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -272,8 +692,14 @@ define <1 x i64> @ashr_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg d1, d1 -; NONEON-NOSVE-NEXT: sshl d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: asr x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = ashr <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -291,8 +717,16 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: ashr_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d -; NONEON-NOSVE-NEXT: sshl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: asr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: asr x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = ashr <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -312,13 +746,27 @@ define void @ashr_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ashr_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d -; NONEON-NOSVE-NEXT: sshl v0.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: sshl v1.2d, v3.2d, v1.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: asr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: asr x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: asr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: asr x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -345,11 +793,27 @@ define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h -; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #10] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #8] +; NONEON-NOSVE-NEXT: lsr w10, w11, w10 +; NONEON-NOSVE-NEXT: lsr w11, w13, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w14, w9 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: strh w11, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -367,8 +831,43 @@ define <8 x i8> @lshr_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.8b, v1.8b -; NONEON-NOSVE-NEXT: ushl v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = lshr <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -386,8 +885,74 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b -; NONEON-NOSVE-NEXT: ushl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = lshr <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -407,13 +972,143 @@ define void @lshr_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: lshr_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.16b, v0.16b -; NONEON-NOSVE-NEXT: neg v1.16b, v1.16b -; NONEON-NOSVE-NEXT: ushl v0.16b, v2.16b, v0.16b -; NONEON-NOSVE-NEXT: ushl v1.16b, v3.16b, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -436,11 +1131,18 @@ define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v2.8b -; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s -; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: lsr w9, w11, w10 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i16> %op1, %op2 ret <2 x i16> %res @@ -458,8 +1160,27 @@ define <4 x i16> @lshr_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.4h, v1.4h -; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -477,8 +1198,42 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h -; NONEON-NOSVE-NEXT: ushl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = lshr <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -498,13 +1253,79 @@ define void @lshr_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: lshr_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.8h, v0.8h -; NONEON-NOSVE-NEXT: neg v1.8h, v1.8h -; NONEON-NOSVE-NEXT: ushl v0.8h, v2.8h, v0.8h -; NONEON-NOSVE-NEXT: ushl v1.8h, v3.8h, v1.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -525,8 +1346,17 @@ define <2 x i32> @lshr_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.2s, v1.2s -; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -544,8 +1374,22 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ushl v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = lshr <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -565,13 +1409,39 @@ define void @lshr_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: lshr_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.4s, v0.4s -; NONEON-NOSVE-NEXT: neg v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ushl v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: ushl v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsr w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -592,8 +1462,14 @@ define <1 x i64> @lshr_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg d1, d1 -; NONEON-NOSVE-NEXT: ushl d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: lsr x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = lshr <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -611,8 +1487,16 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: lshr_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ushl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = lshr <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -632,13 +1516,27 @@ define void @lshr_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: lshr_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: neg v0.2d, v0.2d -; NONEON-NOSVE-NEXT: neg v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ushl v0.2d, v2.2d, v0.2d -; NONEON-NOSVE-NEXT: ushl v1.2d, v3.2d, v1.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: lsr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: lsr x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsr x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsr x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -664,9 +1562,18 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v2i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0x0000ff000000ff -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w11, w10, w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shl <2 x i8> %op1, %op2 ret <2 x i8> %res @@ -685,9 +1592,27 @@ define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w11, w12, w11 +; NONEON-NOSVE-NEXT: strh w11, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #12] +; NONEON-NOSVE-NEXT: lsl w10, w11, w10 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: lsl w9, w10, w9 +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shl <4 x i8> %op1, %op2 ret <4 x i8> %res @@ -705,7 +1630,43 @@ define <8 x i8> @shl_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shl <8 x i8> %op1, %op2 ret <8 x i8> %res @@ -723,7 +1684,74 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = shl <16 x i8> %op1, %op2 ret <16 x i8> %res @@ -743,11 +1771,143 @@ define void @shl_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shl_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: ushl v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: ushl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -768,7 +1928,27 @@ define <4 x i16> @shl_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shl <4 x i16> %op1, %op2 ret <4 x i16> %res @@ -786,7 +1966,42 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = shl <8 x i16> %op1, %op2 ret <8 x i16> %res @@ -806,11 +2021,79 @@ define void @shl_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shl_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: ushl v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: ushl v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -831,7 +2114,17 @@ define <2 x i32> @shl_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = shl <2 x i32> %op1, %op2 ret <2 x i32> %res @@ -849,7 +2142,22 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = shl <4 x i32> %op1, %op2 ret <4 x i32> %res @@ -869,11 +2177,39 @@ define void @shl_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shl_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: ushl v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: ushl v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: ldp w9, w10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: lsl w11, w10, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -894,7 +2230,14 @@ define <1 x i64> @shl_v1i64(<1 x i64> %op1, <1 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl d0, d0, d1 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 +; NONEON-NOSVE-NEXT: lsl x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = shl <1 x i64> %op1, %op2 ret <1 x i64> %res @@ -912,7 +2255,16 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: shl_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushl v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %res = shl <2 x i64> %op1, %op2 ret <2 x i64> %res @@ -932,11 +2284,27 @@ define void @shl_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shl_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: ushl v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: ushl v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #56] +; NONEON-NOSVE-NEXT: lsl x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: lsl x8, x9, x8 +; NONEON-NOSVE-NEXT: ldp x9, x10, [sp] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #80] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: lsl x11, x10, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: lsl x8, x9, x8 +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 42d3b9d8f71f86..fd2d9a8fb80d17 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -19,9 +19,26 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i16> %op1 to <4 x half> ret <4 x half> %res @@ -39,17 +56,43 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ushll v1.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: str q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x half> @@ -69,25 +112,76 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ushll v2.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v0.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v3.4s -; NONEON-NOSVE-NEXT: stp q2, q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x half> @@ -111,9 +205,15 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ucvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i16> %op1 to <2 x float> ret <2 x float> %res @@ -131,8 +231,21 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i16> %op1 to <4 x float> ret <4 x float> %res @@ -154,15 +267,33 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x float> @@ -192,21 +323,57 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s -; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: ucvtf s1, s0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x float> @@ -229,9 +396,13 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v1i16_v1f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: umov w8, v0.h[0] -; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %res = uitofp <1 x i16> %op1 to <1 x double> ret <1 x double> %res @@ -250,10 +421,16 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i16_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d1, #0x00ffff0000ffff -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i16> %op1 to <2 x double> ret <2 x double> %res @@ -275,17 +452,31 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i16_v4f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %res = uitofp <4 x i16> %op1 to <4 x double> @@ -318,26 +509,53 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v8i16_v8f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q2, [x1] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #92] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #88] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #144] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #84] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #80] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #128] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #72] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #68] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #64] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = uitofp <8 x i16> %op1 to <8 x double> @@ -390,42 +608,99 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v16i16_v16f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #336 +; NONEON-NOSVE-NEXT: str x29, [sp, #320] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 336 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: ushll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: ushll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] -; NONEON-NOSVE-NEXT: ushll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ushll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ushll v6.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: ushll v7.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: ucvtf v5.2d, v5.2d -; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: ucvtf v4.2d, v4.2d -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v7.2d -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v6.2d -; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #320] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #152] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #136] +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: str d1, [sp, #328] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #104] +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #164] +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #160] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #156] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #152] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #148] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #144] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #140] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #136] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #332] +; NONEON-NOSVE-NEXT: ldp q4, q3, [sp, #192] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #328] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #304] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #188] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #184] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #288] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #180] +; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #288] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #176] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #272] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #172] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #168] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #256] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224] +; NONEON-NOSVE-NEXT: ldp q2, q5, [sp, #256] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #336 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = uitofp <16 x i16> %op1 to <16 x double> @@ -449,9 +724,18 @@ define <2 x half> @ucvtf_v2i32_v2f16(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: str wzr, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x half> ret <2 x half> %res @@ -469,8 +753,24 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i32> %op1 to <4 x half> ret <4 x half> %res @@ -492,11 +792,39 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x half> @@ -525,17 +853,72 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v16i32_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ucvtf v2.4s, v2.4s -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ucvtf v3.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ucvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i32>, ptr %a %res = uitofp <16 x i32> %op1 to <16 x half> @@ -558,7 +941,14 @@ define <2 x float> @ucvtf_v2i32_v2f32(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ucvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x float> ret <2 x float> %res @@ -575,7 +965,18 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <4 x i32> %op1 to <4 x float> ret <4 x float> %res @@ -593,10 +994,28 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: ucvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x float> @@ -620,8 +1039,16 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i32_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i32> %op1 to <2 x double> ret <2 x double> %res @@ -643,15 +1070,23 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: ucvtf_v4i32_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = uitofp <4 x i32> %op1 to <4 x double> @@ -681,21 +1116,37 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: ucvtf_v8i32_v8f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: ushll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ushll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ushll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: ucvtf d1, d0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = uitofp <8 x i32> %op1 to <8 x double> @@ -726,14 +1177,17 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] ; NONEON-NOSVE-NEXT: ucvtf s0, x8 -; NONEON-NOSVE-NEXT: fcvt h2, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x half> ret <2 x half> %res @@ -758,12 +1212,25 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x half> @@ -801,18 +1268,43 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn v2.2s, v2.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d -; NONEON-NOSVE-NEXT: fcvtn2 v2.4s, v3.2d -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v2.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ucvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i64>, ptr %a %res = uitofp <8 x i64> %op1 to <8 x half> @@ -835,8 +1327,14 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res @@ -858,11 +1356,19 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x float> @@ -891,17 +1397,32 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v8i64_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: ucvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn v1.2s, v1.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v2.2d -; NONEON-NOSVE-NEXT: fcvtn2 v1.4s, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: str q2, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: ucvtf s1, x9 +; NONEON-NOSVE-NEXT: ucvtf s0, x8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i64>, ptr %a %res = uitofp <8 x i64> %op1 to <8 x float> @@ -924,7 +1445,14 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: ucvtf_v2i64_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: ucvtf d1, x9 +; NONEON-NOSVE-NEXT: ucvtf d0, x8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x double> ret <2 x double> %res @@ -942,10 +1470,20 @@ define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ucvtf_v4i64_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ucvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: ucvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf d1, x9 +; NONEON-NOSVE-NEXT: ucvtf d0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf d1, x9 +; NONEON-NOSVE-NEXT: ucvtf d0, x8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = uitofp <4 x i64> %op1 to <4 x double> @@ -968,9 +1506,26 @@ define <4 x half> @scvtf_v4i16_v4f16(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i16> %op1 to <4 x half> ret <4 x half> %res @@ -988,17 +1543,43 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: sshll v1.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v1.4s -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: str q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #22] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #20] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #18] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x half> @@ -1018,25 +1599,76 @@ define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: sshll v2.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v0.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v2.4s, v2.4s -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: scvtf v3.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v2.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v2.8h, v1.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v3.4s -; NONEON-NOSVE-NEXT: stp q2, q0, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #58] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #56] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #54] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #52] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #50] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x half> @@ -1059,9 +1691,15 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: scvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i16> %op1 to <2 x float> ret <2 x float> %res @@ -1079,8 +1717,21 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i16> %op1 to <4 x float> ret <4 x float> %res @@ -1102,15 +1753,33 @@ define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x float> @@ -1140,21 +1809,57 @@ define void @scvtf_v16i16_v16f32(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: scvtf v2.4s, v2.4s -; NONEON-NOSVE-NEXT: scvtf v3.4s, v3.4s -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #46] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #44] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #42] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #38] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #36] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #34] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #32] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #62] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #58] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #54] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #50] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: scvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x float> @@ -1180,10 +1885,16 @@ define <2 x double> @scvtf_v2i16_v2f64(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i16_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i16> %op1 to <2 x double> ret <2 x double> %res @@ -1205,17 +1916,29 @@ define void @scvtf_v4i16_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i16_v4f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %res = sitofp <4 x i16> %op1 to <4 x double> @@ -1248,26 +1971,49 @@ define void @scvtf_v8i16_v8f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v8i16_v8f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 160 ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q2, [x1] -; NONEON-NOSVE-NEXT: stp q1, q3, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #144] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #128] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %res = sitofp <8 x i16> %op1 to <8 x double> @@ -1320,42 +2066,92 @@ define void @scvtf_v16i16_v16f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v16i16_v16f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #336 +; NONEON-NOSVE-NEXT: str x29, [sp, #320] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 336 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-96]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #8] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: sshll v1.4s, v1.4h, #0 -; NONEON-NOSVE-NEXT: sshll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: sshll v3.4s, v3.4h, #0 -; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: ldr d4, [sp, #88] -; NONEON-NOSVE-NEXT: ldr d6, [sp, #72] -; NONEON-NOSVE-NEXT: ldr d7, [sp, #40] -; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: sshll v7.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: scvtf v5.2d, v5.2d -; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: scvtf v4.2d, v4.2d -; NONEON-NOSVE-NEXT: stp q0, q5, [x1] -; NONEON-NOSVE-NEXT: scvtf v0.2d, v7.2d -; NONEON-NOSVE-NEXT: stp q1, q4, [x1, #64] -; NONEON-NOSVE-NEXT: scvtf v1.2d, v6.2d -; NONEON-NOSVE-NEXT: stp q2, q0, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #96] -; NONEON-NOSVE-NEXT: add sp, sp, #96 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #320] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #52] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #44] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #66] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #152] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #72] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #70] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #68] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #56] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #136] +; NONEON-NOSVE-NEXT: ldp d2, d1, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #60] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: str d1, [sp, #328] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #104] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176] +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #332] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192] +; NONEON-NOSVE-NEXT: scvtf d1, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #328] +; NONEON-NOSVE-NEXT: ldp q4, q3, [sp, #192] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #304] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #288] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #288] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #272] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #256] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224] +; NONEON-NOSVE-NEXT: ldp q2, q5, [sp, #256] +; NONEON-NOSVE-NEXT: stp q3, q4, [x1, #32] +; NONEON-NOSVE-NEXT: stp q6, q7, [x1, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: stp q5, q2, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #336 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sitofp <16 x i16> %op1 to <16 x double> @@ -1379,9 +2175,18 @@ define <2 x half> @scvtf_v2i32_v2f16(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: str wzr, [sp, #12] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x half> ret <2 x half> %res @@ -1399,8 +2204,24 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i32> %op1 to <4 x half> ret <4 x half> %res @@ -1422,11 +2243,39 @@ define <8 x half> @scvtf_v8i32_v8f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v0.8h, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: scvtf s0, w9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x half> @@ -1448,7 +2297,14 @@ define <2 x float> @scvtf_v2i32_v2f32(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: scvtf v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x float> ret <2 x float> %res @@ -1465,7 +2321,18 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <4 x i32> %op1 to <4 x float> ret <4 x float> %res @@ -1483,10 +2350,28 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: scvtf v0.4s, v0.4s -; NONEON-NOSVE-NEXT: scvtf v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s1, w9 +; NONEON-NOSVE-NEXT: scvtf s0, w8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x float> @@ -1510,8 +2395,15 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i32_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i32> %op1 to <2 x double> ret <2 x double> %res @@ -1533,15 +2425,21 @@ define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: scvtf_v4i32_v4f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: stp q0, q1, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = sitofp <4 x i32> %op1 to <4 x double> @@ -1571,21 +2469,33 @@ define void @scvtf_v8i32_v8f64(ptr %a, ptr %b) { ; NONEON-NOSVE-LABEL: scvtf_v8i32_v8f64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-32]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr d2, [sp, #24] -; NONEON-NOSVE-NEXT: ldr d3, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-128]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] +; NONEON-NOSVE-NEXT: stp q2, q3, [x1] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sitofp <8 x i32> %op1 to <8 x double> @@ -1634,36 +2544,68 @@ define void @scvtf_v16i32_v16f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v16i32_v16f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #272 +; NONEON-NOSVE-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 272 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 ; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: ldr x29, [sp, #256] // 8-byte Folded Reload ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] -; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #-64]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 -; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #32] -; NONEON-NOSVE-NEXT: ldr d4, [sp, #24] -; NONEON-NOSVE-NEXT: sshll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: ldr d5, [sp, #56] -; NONEON-NOSVE-NEXT: sshll v3.2d, v3.2s, #0 -; NONEON-NOSVE-NEXT: ldr d6, [sp, #40] -; NONEON-NOSVE-NEXT: sshll v4.2d, v4.2s, #0 -; NONEON-NOSVE-NEXT: ldr d7, [sp, #8] -; NONEON-NOSVE-NEXT: sshll v1.2d, v1.2s, #0 -; NONEON-NOSVE-NEXT: sshll v5.2d, v5.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v2.2d, v2.2d -; NONEON-NOSVE-NEXT: sshll v6.2d, v6.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v3.2d, v3.2d -; NONEON-NOSVE-NEXT: sshll v0.2d, v0.2s, #0 -; NONEON-NOSVE-NEXT: sshll v7.2d, v7.2s, #0 -; NONEON-NOSVE-NEXT: scvtf v4.2d, v4.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: scvtf v5.2d, v5.2d -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: stp q2, q4, [x1, #96] -; NONEON-NOSVE-NEXT: scvtf v2.2d, v6.2d -; NONEON-NOSVE-NEXT: stp q3, q5, [x1, #64] -; NONEON-NOSVE-NEXT: scvtf v3.2d, v7.2d -; NONEON-NOSVE-NEXT: stp q1, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q0, q3, [x1] -; NONEON-NOSVE-NEXT: add sp, sp, #64 +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: str q3, [sp, #64] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #48] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #104] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: str d0, [sp, #264] +; NONEON-NOSVE-NEXT: ldp d0, d2, [sp, #16] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #88] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: scvtf d2, w9 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #120] +; NONEON-NOSVE-NEXT: scvtf d0, w9 +; NONEON-NOSVE-NEXT: str d0, [sp, #152] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #136] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #268] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] +; NONEON-NOSVE-NEXT: scvtf d1, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldp q4, q6, [sp, #208] +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldr q7, [sp, #240] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #160] +; NONEON-NOSVE-NEXT: scvtf d1, w9 +; NONEON-NOSVE-NEXT: scvtf d0, w8 +; NONEON-NOSVE-NEXT: ldr q5, [sp, #160] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #176] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #176] +; NONEON-NOSVE-NEXT: stp q7, q6, [x1, #64] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: stp q4, q3, [x1, #32] +; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #272 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i32>, ptr %a %res = sitofp <16 x i32> %op1 to <16 x double> @@ -1694,14 +2636,17 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov x8, v0.d[1] -; NONEON-NOSVE-NEXT: fmov x9, d0 -; NONEON-NOSVE-NEXT: scvtf s1, x9 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: scvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] ; NONEON-NOSVE-NEXT: scvtf s0, x8 -; NONEON-NOSVE-NEXT: fcvt h2, s0 -; NONEON-NOSVE-NEXT: fcvt h0, s1 -; NONEON-NOSVE-NEXT: mov v0.h[1], v2.h[0] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x half> ret <2 x half> %res @@ -1726,12 +2671,25 @@ define <4 x half> @scvtf_v4i64_v4f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: scvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: scvtf s0, x9 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x half> @@ -1754,8 +2712,14 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: scvtf s1, x9 +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res @@ -1777,11 +2741,19 @@ define <4 x float> @scvtf_v4i64_v4f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d -; NONEON-NOSVE-NEXT: fcvtn v0.2s, v0.2d -; NONEON-NOSVE-NEXT: fcvtn2 v0.4s, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: scvtf s1, x9 +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] +; NONEON-NOSVE-NEXT: scvtf s1, x9 +; NONEON-NOSVE-NEXT: scvtf s0, x8 +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x float> @@ -1803,7 +2775,14 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: scvtf_v2i64_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: scvtf d1, x9 +; NONEON-NOSVE-NEXT: scvtf d0, x8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x double> ret <2 x double> %res @@ -1821,10 +2800,20 @@ define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: scvtf_v4i64_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: scvtf v0.2d, v0.2d -; NONEON-NOSVE-NEXT: scvtf v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: scvtf d1, x9 +; NONEON-NOSVE-NEXT: scvtf d0, x8 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] +; NONEON-NOSVE-NEXT: scvtf d1, x9 +; NONEON-NOSVE-NEXT: scvtf d0, x8 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sitofp <4 x i64> %op1 to <4 x double> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 250929df6b3c35..af15d5f67ad15c 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -22,9 +22,40 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #18] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: csel w9, w13, w12, ne +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #4] +; NONEON-NOSVE-NEXT: tst w11, #0xffff +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2] +; NONEON-NOSVE-NEXT: csel w9, w12, w9, ne +; NONEON-NOSVE-NEXT: tst w10, #0xffff +; NONEON-NOSVE-NEXT: ldrh w10, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csel w9, w11, w9, ne +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i8> %op1, <4 x i8> %op2 ret <4 x i8> %sel @@ -47,9 +78,68 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.8b, v2.8b, #7 -; NONEON-NOSVE-NEXT: cmlt v2.8b, v2.8b, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #23] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #7] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #21] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #20] +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #18] +; NONEON-NOSVE-NEXT: tst w13, #0xff +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #17] +; NONEON-NOSVE-NEXT: csel w13, w17, w16, ne +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: tst w15, #0xff +; NONEON-NOSVE-NEXT: strb w13, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #5] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: csel w13, w16, w13, ne +; NONEON-NOSVE-NEXT: tst w14, #0xff +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #4] +; NONEON-NOSVE-NEXT: strb w13, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: csel w13, w15, w13, ne +; NONEON-NOSVE-NEXT: tst w12, #0xff +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: strb w13, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #12] +; NONEON-NOSVE-NEXT: csel w12, w14, w13, ne +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: tst w11, #0xff +; NONEON-NOSVE-NEXT: strb w12, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #11] +; NONEON-NOSVE-NEXT: csel w11, w13, w12, ne +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #2] +; NONEON-NOSVE-NEXT: tst w10, #0xff +; NONEON-NOSVE-NEXT: strb w11, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: csel w10, w12, w11, ne +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #1] +; NONEON-NOSVE-NEXT: tst w9, #0xff +; NONEON-NOSVE-NEXT: strb w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #9] +; NONEON-NOSVE-NEXT: csel w9, w11, w10, ne +; NONEON-NOSVE-NEXT: ldrb w10, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xff +; NONEON-NOSVE-NEXT: strb w9, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x i8> %op1, <8 x i8> %op2 ret <8 x i8> %sel @@ -72,9 +162,124 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) ; ; NONEON-NOSVE-LABEL: select_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.16b, v2.16b, #7 -; NONEON-NOSVE-NEXT: cmlt v2.16b, v2.16b, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q1, q2, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #47] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #45] +; NONEON-NOSVE-NEXT: sbfx w2, w2, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w4, w4, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #44] +; NONEON-NOSVE-NEXT: sbfx w3, w3, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #42] +; NONEON-NOSVE-NEXT: tst w2, #0xff +; NONEON-NOSVE-NEXT: sbfx w1, w1, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #41] +; NONEON-NOSVE-NEXT: csel w2, w6, w5, ne +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #14] +; NONEON-NOSVE-NEXT: tst w4, #0xff +; NONEON-NOSVE-NEXT: strb w2, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #13] +; NONEON-NOSVE-NEXT: sbfx w0, w0, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w18, w18, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w17, w17, #0, #1 +; NONEON-NOSVE-NEXT: csel w2, w5, w2, ne +; NONEON-NOSVE-NEXT: tst w3, #0xff +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #12] +; NONEON-NOSVE-NEXT: strb w2, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #37] +; NONEON-NOSVE-NEXT: csel w2, w4, w2, ne +; NONEON-NOSVE-NEXT: tst w1, #0xff +; NONEON-NOSVE-NEXT: sbfx w16, w16, #0, #1 +; NONEON-NOSVE-NEXT: strb w2, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #28] +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #36] +; NONEON-NOSVE-NEXT: csel w1, w3, w2, ne +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #11] +; NONEON-NOSVE-NEXT: tst w0, #0xff +; NONEON-NOSVE-NEXT: strb w1, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #27] +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: csel w0, w2, w1, ne +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #10] +; NONEON-NOSVE-NEXT: tst w18, #0xff +; NONEON-NOSVE-NEXT: strb w0, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #26] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: csel w18, w1, w0, ne +; NONEON-NOSVE-NEXT: ldrb w0, [sp, #9] +; NONEON-NOSVE-NEXT: tst w17, #0xff +; NONEON-NOSVE-NEXT: strb w18, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #25] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: csel w17, w0, w18, ne +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #8] +; NONEON-NOSVE-NEXT: tst w16, #0xff +; NONEON-NOSVE-NEXT: strb w17, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #24] +; NONEON-NOSVE-NEXT: csel w16, w18, w17, ne +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #7] +; NONEON-NOSVE-NEXT: tst w15, #0xff +; NONEON-NOSVE-NEXT: strb w16, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #23] +; NONEON-NOSVE-NEXT: csel w15, w17, w16, ne +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: tst w14, #0xff +; NONEON-NOSVE-NEXT: strb w15, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #22] +; NONEON-NOSVE-NEXT: csel w14, w16, w15, ne +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #5] +; NONEON-NOSVE-NEXT: tst w13, #0xff +; NONEON-NOSVE-NEXT: strb w14, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #21] +; NONEON-NOSVE-NEXT: csel w13, w15, w14, ne +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #4] +; NONEON-NOSVE-NEXT: tst w12, #0xff +; NONEON-NOSVE-NEXT: strb w13, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #20] +; NONEON-NOSVE-NEXT: csel w12, w14, w13, ne +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: tst w11, #0xff +; NONEON-NOSVE-NEXT: strb w12, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #19] +; NONEON-NOSVE-NEXT: csel w11, w13, w12, ne +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #2] +; NONEON-NOSVE-NEXT: tst w10, #0xff +; NONEON-NOSVE-NEXT: strb w11, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #18] +; NONEON-NOSVE-NEXT: csel w10, w12, w11, ne +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #1] +; NONEON-NOSVE-NEXT: tst w9, #0xff +; NONEON-NOSVE-NEXT: strb w10, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #17] +; NONEON-NOSVE-NEXT: csel w9, w11, w10, ne +; NONEON-NOSVE-NEXT: ldrb w10, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xff +; NONEON-NOSVE-NEXT: strb w9, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <16 x i1> %mask, <16 x i8> %op1, <16 x i8> %op2 ret <16 x i8> %sel @@ -95,14 +300,204 @@ define void @select_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: cmeq v4.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: cmeq v5.16b, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #208 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #112] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #144] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #160] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 208 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -8 +; NONEON-NOSVE-NEXT: .cfi_offset w20, -16 +; NONEON-NOSVE-NEXT: .cfi_offset w21, -24 +; NONEON-NOSVE-NEXT: .cfi_offset w22, -32 +; NONEON-NOSVE-NEXT: .cfi_offset w23, -40 +; NONEON-NOSVE-NEXT: .cfi_offset w24, -48 +; NONEON-NOSVE-NEXT: .cfi_offset w25, -56 +; NONEON-NOSVE-NEXT: .cfi_offset w26, -64 +; NONEON-NOSVE-NEXT: .cfi_offset w27, -72 +; NONEON-NOSVE-NEXT: .cfi_offset w28, -80 +; NONEON-NOSVE-NEXT: .cfi_offset w30, -88 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -96 +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #19] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #20] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, eq +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #37] +; NONEON-NOSVE-NEXT: csel w9, w11, w10, eq +; NONEON-NOSVE-NEXT: cmp w13, w12 +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #21] +; NONEON-NOSVE-NEXT: str w8, [sp, #12] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: csel w8, w13, w12, eq +; NONEON-NOSVE-NEXT: cmp w16, w14 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #22] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #4] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: csel w8, w16, w14, eq +; NONEON-NOSVE-NEXT: cmp w1, w18 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #23] +; NONEON-NOSVE-NEXT: csel w12, w1, w18, eq +; NONEON-NOSVE-NEXT: cmp w2, w13 +; NONEON-NOSVE-NEXT: ldrb w18, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #24] +; NONEON-NOSVE-NEXT: csel w13, w2, w13, eq +; NONEON-NOSVE-NEXT: cmp w16, w14 +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #25] +; NONEON-NOSVE-NEXT: csel w14, w16, w14, eq +; NONEON-NOSVE-NEXT: cmp w1, w18 +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #26] +; NONEON-NOSVE-NEXT: csel w16, w1, w18, eq +; NONEON-NOSVE-NEXT: ldrb w1, [sp, #42] +; NONEON-NOSVE-NEXT: cmp w5, w2 +; NONEON-NOSVE-NEXT: csel w18, w5, w2, eq +; NONEON-NOSVE-NEXT: ldrb w2, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w5, [sp, #27] +; NONEON-NOSVE-NEXT: cmp w6, w1 +; NONEON-NOSVE-NEXT: ldrb w19, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #45] +; NONEON-NOSVE-NEXT: csel w1, w6, w1, eq +; NONEON-NOSVE-NEXT: ldrb w6, [sp, #44] +; NONEON-NOSVE-NEXT: cmp w5, w2 +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #29] +; NONEON-NOSVE-NEXT: str w8, [sp] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: csel w2, w5, w2, eq +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: cmp w19, w6 +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w21, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #31] +; NONEON-NOSVE-NEXT: csel w5, w19, w6, eq +; NONEON-NOSVE-NEXT: cmp w30, w29 +; NONEON-NOSVE-NEXT: ldrb w22, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #48] +; NONEON-NOSVE-NEXT: csel w6, w30, w29, eq +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #49] +; NONEON-NOSVE-NEXT: csel w19, w8, w9, eq +; NONEON-NOSVE-NEXT: cmp w10, w21 +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: csel w21, w10, w21, eq +; NONEON-NOSVE-NEXT: cmp w11, w22 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #51] +; NONEON-NOSVE-NEXT: csel w22, w11, w22, eq +; NONEON-NOSVE-NEXT: cmp w29, w28 +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #52] +; NONEON-NOSVE-NEXT: csel w11, w29, w28, eq +; NONEON-NOSVE-NEXT: cmp w8, w27 +; NONEON-NOSVE-NEXT: ldrb w24, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #53] +; NONEON-NOSVE-NEXT: csel w8, w8, w27, eq +; NONEON-NOSVE-NEXT: cmp w9, w26 +; NONEON-NOSVE-NEXT: ldrb w23, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #54] +; NONEON-NOSVE-NEXT: csel w9, w9, w26, eq +; NONEON-NOSVE-NEXT: cmp w10, w25 +; NONEON-NOSVE-NEXT: ldrb w20, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #55] +; NONEON-NOSVE-NEXT: csel w10, w10, w25, eq +; NONEON-NOSVE-NEXT: cmp w28, w24 +; NONEON-NOSVE-NEXT: ldrb w7, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #56] +; NONEON-NOSVE-NEXT: csel w24, w28, w24, eq +; NONEON-NOSVE-NEXT: cmp w27, w23 +; NONEON-NOSVE-NEXT: ldrb w4, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #57] +; NONEON-NOSVE-NEXT: csel w23, w27, w23, eq +; NONEON-NOSVE-NEXT: cmp w26, w20 +; NONEON-NOSVE-NEXT: ldrb w3, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #58] +; NONEON-NOSVE-NEXT: csel w20, w26, w20, eq +; NONEON-NOSVE-NEXT: cmp w25, w7 +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #59] +; NONEON-NOSVE-NEXT: csel w7, w25, w7, eq +; NONEON-NOSVE-NEXT: cmp w28, w4 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #60] +; NONEON-NOSVE-NEXT: csel w4, w28, w4, eq +; NONEON-NOSVE-NEXT: cmp w27, w3 +; NONEON-NOSVE-NEXT: csel w3, w27, w3, eq +; NONEON-NOSVE-NEXT: cmp w26, w17 +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w27, [sp, #61] +; NONEON-NOSVE-NEXT: csel w17, w26, w17, eq +; NONEON-NOSVE-NEXT: cmp w25, w15 +; NONEON-NOSVE-NEXT: ldrb w26, [sp, #78] +; NONEON-NOSVE-NEXT: csel w15, w25, w15, eq +; NONEON-NOSVE-NEXT: ldrb w25, [sp, #62] +; NONEON-NOSVE-NEXT: cmp w27, w28 +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w30, [sp, #63] +; NONEON-NOSVE-NEXT: strb w9, [sp, #99] +; NONEON-NOSVE-NEXT: csel w27, w27, w28, eq +; NONEON-NOSVE-NEXT: cmp w25, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #98] +; NONEON-NOSVE-NEXT: csel w25, w25, w26, eq +; NONEON-NOSVE-NEXT: cmp w30, w29 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: csel w26, w30, w29, eq +; NONEON-NOSVE-NEXT: ldrb w28, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w29, [sp, #16] +; NONEON-NOSVE-NEXT: strb w26, [sp, #111] +; NONEON-NOSVE-NEXT: strb w9, [sp, #84] +; NONEON-NOSVE-NEXT: cmp w29, w28 +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w25, [sp, #110] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #144] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w27, [sp, #109] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: csel w8, w29, w28, eq +; NONEON-NOSVE-NEXT: strb w15, [sp, #108] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w17, [sp, #107] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #112] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w3, [sp, #106] +; NONEON-NOSVE-NEXT: strb w4, [sp, #105] +; NONEON-NOSVE-NEXT: strb w7, [sp, #104] +; NONEON-NOSVE-NEXT: strb w20, [sp, #103] +; NONEON-NOSVE-NEXT: strb w23, [sp, #102] +; NONEON-NOSVE-NEXT: strb w24, [sp, #101] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w10, [sp, #100] +; NONEON-NOSVE-NEXT: strb w11, [sp, #97] +; NONEON-NOSVE-NEXT: strb w22, [sp, #96] +; NONEON-NOSVE-NEXT: strb w21, [sp, #95] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w19, [sp, #94] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w6, [sp, #93] +; NONEON-NOSVE-NEXT: strb w5, [sp, #92] +; NONEON-NOSVE-NEXT: strb w2, [sp, #91] +; NONEON-NOSVE-NEXT: strb w1, [sp, #90] +; NONEON-NOSVE-NEXT: strb w18, [sp, #89] +; NONEON-NOSVE-NEXT: strb w16, [sp, #88] +; NONEON-NOSVE-NEXT: strb w14, [sp, #87] +; NONEON-NOSVE-NEXT: strb w13, [sp, #86] +; NONEON-NOSVE-NEXT: strb w12, [sp, #85] +; NONEON-NOSVE-NEXT: strb w9, [sp, #82] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #80] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #208 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -129,9 +524,25 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 -; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #4] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: csel w8, w11, w10, ne +; NONEON-NOSVE-NEXT: ldr w10, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w10, w8, ne +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i16> %op1, <2 x i16> %op2 ret <2 x i16> %sel @@ -154,9 +565,40 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.4h, v2.4h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.4h, v2.4h, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #18] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: csel w9, w13, w12, ne +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #4] +; NONEON-NOSVE-NEXT: tst w11, #0xffff +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2] +; NONEON-NOSVE-NEXT: csel w9, w12, w9, ne +; NONEON-NOSVE-NEXT: tst w10, #0xffff +; NONEON-NOSVE-NEXT: ldrh w10, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: csel w9, w11, w9, ne +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i16> %op1, <4 x i16> %op2 ret <4 x i16> %sel @@ -180,10 +622,68 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.8h, v2.8b, #0 -; NONEON-NOSVE-NEXT: shl v2.8h, v2.8h, #15 -; NONEON-NOSVE-NEXT: cmlt v2.8h, v2.8h, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #47] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #45] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #44] +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #42] +; NONEON-NOSVE-NEXT: tst w13, #0xffff +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: csel w13, w17, w16, ne +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #12] +; NONEON-NOSVE-NEXT: tst w15, #0xffff +; NONEON-NOSVE-NEXT: strh w13, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #10] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: csel w13, w16, w13, ne +; NONEON-NOSVE-NEXT: tst w14, #0xffff +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #8] +; NONEON-NOSVE-NEXT: strh w13, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: csel w13, w15, w13, ne +; NONEON-NOSVE-NEXT: tst w12, #0xffff +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: strh w13, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #24] +; NONEON-NOSVE-NEXT: csel w12, w14, w13, ne +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #6] +; NONEON-NOSVE-NEXT: tst w11, #0xffff +; NONEON-NOSVE-NEXT: strh w12, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #22] +; NONEON-NOSVE-NEXT: csel w11, w13, w12, ne +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #4] +; NONEON-NOSVE-NEXT: tst w10, #0xffff +; NONEON-NOSVE-NEXT: strh w11, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #20] +; NONEON-NOSVE-NEXT: csel w10, w12, w11, ne +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2] +; NONEON-NOSVE-NEXT: tst w9, #0xffff +; NONEON-NOSVE-NEXT: strh w10, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #18] +; NONEON-NOSVE-NEXT: csel w9, w11, w10, ne +; NONEON-NOSVE-NEXT: ldrh w10, [sp] +; NONEON-NOSVE-NEXT: tst w8, #0xffff +; NONEON-NOSVE-NEXT: strh w9, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <8 x i1> %mask, <8 x i16> %op1, <8 x i16> %op2 ret <8 x i16> %sel @@ -204,14 +704,98 @@ define void @select_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: cmeq v4.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: cmeq v5.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #112 +; NONEON-NOSVE-NEXT: str x19, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 112 +; NONEON-NOSVE-NEXT: .cfi_offset w19, -16 +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #6] +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w9, w8, eq +; NONEON-NOSVE-NEXT: cmp w13, w12 +; NONEON-NOSVE-NEXT: ldrh w1, [sp, #12] +; NONEON-NOSVE-NEXT: csel w9, w13, w12, eq +; NONEON-NOSVE-NEXT: cmp w15, w14 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #10] +; NONEON-NOSVE-NEXT: csel w14, w15, w14, eq +; NONEON-NOSVE-NEXT: cmp w17, w16 +; NONEON-NOSVE-NEXT: csel w16, w17, w16, eq +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #28] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: cmp w13, w12 +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #14] +; NONEON-NOSVE-NEXT: csel w12, w13, w12, eq +; NONEON-NOSVE-NEXT: cmp w1, w17 +; NONEON-NOSVE-NEXT: ldrh w5, [sp, #32] +; NONEON-NOSVE-NEXT: csel w17, w1, w17, eq +; NONEON-NOSVE-NEXT: ldrh w1, [sp, #48] +; NONEON-NOSVE-NEXT: cmp w4, w3 +; NONEON-NOSVE-NEXT: ldrh w6, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #52] +; NONEON-NOSVE-NEXT: csel w3, w4, w3, eq +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #36] +; NONEON-NOSVE-NEXT: cmp w5, w1 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #38] +; NONEON-NOSVE-NEXT: csel w1, w5, w1, eq +; NONEON-NOSVE-NEXT: cmp w7, w6 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w5, [sp, #40] +; NONEON-NOSVE-NEXT: csel w6, w7, w6, eq +; NONEON-NOSVE-NEXT: cmp w4, w2 +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #42] +; NONEON-NOSVE-NEXT: csel w2, w4, w2, eq +; NONEON-NOSVE-NEXT: cmp w19, w13 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #44] +; NONEON-NOSVE-NEXT: csel w13, w19, w13, eq +; NONEON-NOSVE-NEXT: cmp w5, w18 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #46] +; NONEON-NOSVE-NEXT: csel w18, w5, w18, eq +; NONEON-NOSVE-NEXT: cmp w7, w15 +; NONEON-NOSVE-NEXT: ldrh w5, [sp] +; NONEON-NOSVE-NEXT: csel w15, w7, w15, eq +; NONEON-NOSVE-NEXT: cmp w4, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: csel w11, w4, w11, eq +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w19, w10 +; NONEON-NOSVE-NEXT: csel w10, w19, w10, eq +; NONEON-NOSVE-NEXT: strh w11, [sp, #92] +; NONEON-NOSVE-NEXT: ldr x19, [sp, #96] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: cmp w5, w4 +; NONEON-NOSVE-NEXT: strh w10, [sp, #94] +; NONEON-NOSVE-NEXT: csel w8, w5, w4, eq +; NONEON-NOSVE-NEXT: strh w15, [sp, #90] +; NONEON-NOSVE-NEXT: strh w18, [sp, #88] +; NONEON-NOSVE-NEXT: strh w13, [sp, #86] +; NONEON-NOSVE-NEXT: strh w2, [sp, #84] +; NONEON-NOSVE-NEXT: strh w6, [sp, #82] +; NONEON-NOSVE-NEXT: strh w1, [sp, #80] +; NONEON-NOSVE-NEXT: strh w3, [sp, #78] +; NONEON-NOSVE-NEXT: strh w17, [sp, #76] +; NONEON-NOSVE-NEXT: strh w12, [sp, #74] +; NONEON-NOSVE-NEXT: strh w16, [sp, #72] +; NONEON-NOSVE-NEXT: strh w14, [sp, #70] +; NONEON-NOSVE-NEXT: strh w9, [sp, #68] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #112 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -238,9 +822,25 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v2.2s, v2.2s, #31 -; NONEON-NOSVE-NEXT: cmlt v2.2s, v2.2s, #0 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #12] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #4] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: csel w8, w11, w10, ne +; NONEON-NOSVE-NEXT: ldr w10, [sp] +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: csel w8, w10, w8, ne +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i32> %op1, <2 x i32> %op2 ret <2 x i32> %sel @@ -264,10 +864,40 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.4s, v2.4h, #0 -; NONEON-NOSVE-NEXT: shl v2.4s, v2.4s, #31 -; NONEON-NOSVE-NEXT: cmlt v2.4s, v2.4s, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #42] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: cmp w9, #0 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: csel w9, w13, w12, ne +; NONEON-NOSVE-NEXT: ldr w12, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w11, #0 +; NONEON-NOSVE-NEXT: str w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #4] +; NONEON-NOSVE-NEXT: csel w9, w12, w9, ne +; NONEON-NOSVE-NEXT: cmp w10, #0 +; NONEON-NOSVE-NEXT: ldr w10, [sp] +; NONEON-NOSVE-NEXT: str w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: csel w9, w11, w9, ne +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: str w9, [sp, #52] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w10, w9, ne +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <4 x i1> %mask, <4 x i32> %op1, <4 x i32> %op2 ret <4 x i32> %sel @@ -288,14 +918,43 @@ define void @select_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: cmeq v4.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: cmeq v5.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w10, w13, [sp, #4] +; NONEON-NOSVE-NEXT: ldp w12, w11, [sp, #24] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w10, w9 +; NONEON-NOSVE-NEXT: csel w9, w10, w9, eq +; NONEON-NOSVE-NEXT: cmp w13, w12 +; NONEON-NOSVE-NEXT: ldp w15, w16, [sp, #48] +; NONEON-NOSVE-NEXT: csel w12, w13, w12, eq +; NONEON-NOSVE-NEXT: cmp w14, w11 +; NONEON-NOSVE-NEXT: ldp w10, w13, [sp, #32] +; NONEON-NOSVE-NEXT: csel w11, w14, w11, eq +; NONEON-NOSVE-NEXT: ldp w17, w14, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w18, w1, [sp, #40] +; NONEON-NOSVE-NEXT: cmp w10, w15 +; NONEON-NOSVE-NEXT: stp w12, w11, [sp, #72] +; NONEON-NOSVE-NEXT: csel w10, w10, w15, eq +; NONEON-NOSVE-NEXT: cmp w13, w16 +; NONEON-NOSVE-NEXT: ldr w15, [sp] +; NONEON-NOSVE-NEXT: csel w13, w13, w16, eq +; NONEON-NOSVE-NEXT: cmp w18, w17 +; NONEON-NOSVE-NEXT: csel w16, w18, w17, eq +; NONEON-NOSVE-NEXT: cmp w1, w14 +; NONEON-NOSVE-NEXT: stp w10, w13, [sp, #80] +; NONEON-NOSVE-NEXT: csel w10, w1, w14, eq +; NONEON-NOSVE-NEXT: cmp w15, w8 +; NONEON-NOSVE-NEXT: csel w8, w15, w8, eq +; NONEON-NOSVE-NEXT: stp w16, w10, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -321,10 +980,15 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v1i64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d1 +; NONEON-NOSVE-NEXT: fmov x9, d0 ; NONEON-NOSVE-NEXT: tst w0, #0x1 -; NONEON-NOSVE-NEXT: csetm x8, ne -; NONEON-NOSVE-NEXT: fmov d2, x8 -; NONEON-NOSVE-NEXT: bif v0.8b, v1.8b, v2.8b +; NONEON-NOSVE-NEXT: csel x8, x9, x8, ne +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %sel = select <1 x i1> %mask, <1 x i64> %op1, <1 x i64> %op2 ret <1 x i64> %sel @@ -348,10 +1012,25 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: select_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ushll v2.2d, v2.2s, #0 -; NONEON-NOSVE-NEXT: shl v2.2d, v2.2d, #63 -; NONEON-NOSVE-NEXT: cmlt v2.2d, v2.2d, #0 -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d2, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x11, [sp, #8] +; NONEON-NOSVE-NEXT: sbfx x8, x8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx x9, x9, #0, #1 +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: csel x8, x11, x10, ne +; NONEON-NOSVE-NEXT: ldr x10, [sp] +; NONEON-NOSVE-NEXT: cmp x9, #0 +; NONEON-NOSVE-NEXT: str x8, [sp, #56] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: csel x8, x10, x8, ne +; NONEON-NOSVE-NEXT: str x8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %sel = select <2 x i1> %mask, <2 x i64> %op1, <2 x i64> %op2 ret <2 x i64> %sel @@ -372,14 +1051,30 @@ define void @select_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: select_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: cmeq v4.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: cmeq v5.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: bif v0.16b, v1.16b, v4.16b -; NONEON-NOSVE-NEXT: mov v1.16b, v5.16b -; NONEON-NOSVE-NEXT: bsl v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-96]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: stp q2, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #8] +; NONEON-NOSVE-NEXT: ldp x8, x11, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x13, [sp, #40] +; NONEON-NOSVE-NEXT: ldp x10, x12, [sp, #48] +; NONEON-NOSVE-NEXT: cmp x9, x8 +; NONEON-NOSVE-NEXT: csel x8, x9, x8, eq +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: csel x9, x11, x10, eq +; NONEON-NOSVE-NEXT: ldr x10, [sp, #16] +; NONEON-NOSVE-NEXT: ldr x11, [sp] +; NONEON-NOSVE-NEXT: cmp x13, x12 +; NONEON-NOSVE-NEXT: csel x12, x13, x12, eq +; NONEON-NOSVE-NEXT: cmp x11, x10 +; NONEON-NOSVE-NEXT: stp x9, x12, [sp, #80] +; NONEON-NOSVE-NEXT: csel x9, x11, x10, eq +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll index 0b6152340f65ab..66d544d0acbf56 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll @@ -33,19 +33,23 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind { ; ; NONEON-NOSVE-LABEL: alloc_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #32 -; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: mov x19, x0 -; NONEON-NOSVE-NEXT: add x0, sp, #12 +; NONEON-NOSVE-NEXT: add x0, sp, #28 ; NONEON-NOSVE-NEXT: bl def -; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: umov w8, v0.h[2] -; NONEON-NOSVE-NEXT: umov w9, v0.h[0] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] ; NONEON-NOSVE-NEXT: strb w8, [x19, #1] ; NONEON-NOSVE-NEXT: strb w9, [x19] -; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %alloc = alloca [4 x i8] call void @def(ptr %alloc) @@ -88,21 +92,25 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind { ; ; NONEON-NOSVE-LABEL: alloc_v6i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #32 -; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: mov x19, x0 -; NONEON-NOSVE-NEXT: add x0, sp, #8 +; NONEON-NOSVE-NEXT: add x0, sp, #24 ; NONEON-NOSVE-NEXT: bl def -; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] -; NONEON-NOSVE-NEXT: add x9, x19, #2 -; NONEON-NOSVE-NEXT: rev16 v1.16b, v0.16b -; NONEON-NOSVE-NEXT: xtn v1.8b, v1.8h -; NONEON-NOSVE-NEXT: str s1, [sp, #4] -; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] -; NONEON-NOSVE-NEXT: st1 { v0.b }[5], [x9] -; NONEON-NOSVE-NEXT: strh w8, [x19] -; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: add sp, sp, #32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: str x8, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [x19, #2] +; NONEON-NOSVE-NEXT: strh w9, [x19] +; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %alloc = alloca [6 x i8] call void @def(ptr %alloc) @@ -135,18 +143,38 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind { ; ; NONEON-NOSVE-LABEL: alloc_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #48 -; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #112 +; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: mov x19, x0 -; NONEON-NOSVE-NEXT: mov x0, sp +; NONEON-NOSVE-NEXT: add x0, sp, #64 ; NONEON-NOSVE-NEXT: bl def -; NONEON-NOSVE-NEXT: ldp q0, q1, [sp] -; NONEON-NOSVE-NEXT: add x8, x19, #8 -; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h -; NONEON-NOSVE-NEXT: st1 { v1.b }[0], [x8] -; NONEON-NOSVE-NEXT: str d0, [x19] -; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: add sp, sp, #48 +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [x19, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: str x8, [x19] +; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #112 ; NONEON-NOSVE-NEXT: ret %alloc = alloca [32 x i8] call void @def(ptr %alloc) @@ -179,18 +207,26 @@ define void @alloc_v8f64(ptr %st_ptr) nounwind { ; ; NONEON-NOSVE-LABEL: alloc_v8f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #80 -; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #176 +; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #160] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: mov x19, x0 -; NONEON-NOSVE-NEXT: mov x0, sp +; NONEON-NOSVE-NEXT: add x0, sp, #96 ; NONEON-NOSVE-NEXT: bl def -; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #32] -; NONEON-NOSVE-NEXT: ldp q3, q2, [sp] -; NONEON-NOSVE-NEXT: zip1 v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: zip1 v1.2d, v3.2d, v2.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #96] +; NONEON-NOSVE-NEXT: ldp q2, q3, [sp, #128] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: stp q1, q0, [x19] -; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload -; NONEON-NOSVE-NEXT: add sp, sp, #80 +; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #176 ; NONEON-NOSVE-NEXT: ret %alloc = alloca [8 x double] call void @def(ptr %alloc) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll index 42c439ca4b38d4..3b83f982b6bfc5 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll @@ -22,15 +22,68 @@ define <4 x i32> @test(ptr %arg1, ptr %arg2) { ; ; NONEON-NOSVE-LABEL: test: ; NONEON-NOSVE: // %bb.0: // %entry -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q3, q4, [x0] -; NONEON-NOSVE-NEXT: add v2.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v5.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: dup v0.4s, v1.s[2] -; NONEON-NOSVE-NEXT: add v1.4s, v3.4s, v3.4s -; NONEON-NOSVE-NEXT: add v3.4s, v4.4s, v4.4s -; NONEON-NOSVE-NEXT: stp q2, q5, [x0, #32] -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #60] +; NONEON-NOSVE-NEXT: str q2, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w9, [sp, #124] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldr q0, [sp] +; NONEON-NOSVE-NEXT: str w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #52] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #116] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #28] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #44] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #100] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q4, q2, [sp, #80] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q3, q1, [sp, #112] +; NONEON-NOSVE-NEXT: stp q4, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret entry: %0 = load <16 x i32>, ptr %arg1, align 256 @@ -59,15 +112,71 @@ define <2 x i32> @test2(ptr %arg1, ptr %arg2) { ; ; NONEON-NOSVE-LABEL: test2: ; NONEON-NOSVE: // %bb.0: // %entry -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q3, q4, [x0] -; NONEON-NOSVE-NEXT: add v2.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: dup v0.2s, v1.s[2] -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: add v3.4s, v3.4s, v3.4s -; NONEON-NOSVE-NEXT: add v4.4s, v4.4s, v4.4s -; NONEON-NOSVE-NEXT: stp q2, q1, [x0, #32] -; NONEON-NOSVE-NEXT: stp q3, q4, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 144 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: str q2, [sp, #16] +; NONEON-NOSVE-NEXT: str d0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #124] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #120] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #116] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #108] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #100] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp q4, q2, [sp, #80] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q3, q1, [sp, #112] +; NONEON-NOSVE-NEXT: stp q4, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q2, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret entry: %0 = load <16 x i32>, ptr %arg1, align 256 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll index 992b667a2eafe1..c97a3c2e721a3d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll @@ -15,9 +15,18 @@ define <4 x i8> @load_v4i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: load_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #3] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #1] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [x0] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %load = load <4 x i8>, ptr %a ret <4 x i8> %load @@ -75,11 +84,14 @@ define <2 x i16> @load_v2i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: load_v2i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [x0, #2] +; NONEON-NOSVE-NEXT: str w8, [sp, #12] ; NONEON-NOSVE-NEXT: ldrh w8, [x0] -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: add x8, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %load = load <2 x i16>, ptr %a ret <2 x i16> %load @@ -93,7 +105,12 @@ define <2 x half> @load_v2f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: load_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %load = load <2 x half>, ptr %a ret <2 x half> %load diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll index 7abe73f08dfd65..9e1edb817c459a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll @@ -21,10 +21,17 @@ define i8 @andv_v4i8(<4 x i8> %a) { ; ; NONEON-NOSVE-LABEL: andv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: and w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %a) ret i8 %res @@ -41,11 +48,25 @@ define i8 @andv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: andv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: and w12, w13, w12 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: and w10, w12, w10 +; NONEON-NOSVE-NEXT: and w8, w8, w14 +; NONEON-NOSVE-NEXT: and w8, w10, w8 ; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a) ret i8 %res @@ -64,13 +85,37 @@ define i8 @andv_v16i8(<16 x i8> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 -; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #2] +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: and w11, w14, w13 +; NONEON-NOSVE-NEXT: and w9, w12, w9 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #7] +; NONEON-NOSVE-NEXT: and w10, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: and w9, w9, w16 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #9] +; NONEON-NOSVE-NEXT: and w12, w12, w15 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #14] +; NONEON-NOSVE-NEXT: and w8, w13, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #15] +; NONEON-NOSVE-NEXT: and w12, w12, w14 +; NONEON-NOSVE-NEXT: and w8, w8, w11 +; NONEON-NOSVE-NEXT: and w9, w10, w9 +; NONEON-NOSVE-NEXT: and w10, w12, w16 +; NONEON-NOSVE-NEXT: and w8, w8, w15 +; NONEON-NOSVE-NEXT: and w9, w9, w10 +; NONEON-NOSVE-NEXT: and w8, w8, w13 +; NONEON-NOSVE-NEXT: and w0, w9, w8 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a) @@ -90,17 +135,72 @@ define i8 @andv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: andv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #2] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #3] +; NONEON-NOSVE-NEXT: and w9, w11, w10 +; NONEON-NOSVE-NEXT: and w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: and w11, w15, w14 +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #4] +; NONEON-NOSVE-NEXT: and w9, w10, w11 +; NONEON-NOSVE-NEXT: and w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #6] +; NONEON-NOSVE-NEXT: and w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: and w10, w14, w10 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #7] +; NONEON-NOSVE-NEXT: and w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w8, w9 +; NONEON-NOSVE-NEXT: and w9, w10, w11 +; NONEON-NOSVE-NEXT: and w10, w14, w13 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #9] +; NONEON-NOSVE-NEXT: and w8, w8, w9 +; NONEON-NOSVE-NEXT: and w11, w15, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #26] +; NONEON-NOSVE-NEXT: and w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #10] +; NONEON-NOSVE-NEXT: and w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #12] +; NONEON-NOSVE-NEXT: and w9, w9, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #14] +; NONEON-NOSVE-NEXT: and w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #15] +; NONEON-NOSVE-NEXT: and w10, w13, w10 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: and w14, w15, w14 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #30] +; NONEON-NOSVE-NEXT: and w9, w9, w14 +; NONEON-NOSVE-NEXT: and w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #31] +; NONEON-NOSVE-NEXT: and w8, w8, w9 +; NONEON-NOSVE-NEXT: and w10, w10, w12 +; NONEON-NOSVE-NEXT: and w11, w16, w11 +; NONEON-NOSVE-NEXT: and w10, w10, w11 +; NONEON-NOSVE-NEXT: and w11, w17, w13 +; NONEON-NOSVE-NEXT: and w9, w10, w11 ; NONEON-NOSVE-NEXT: and w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op) @@ -118,9 +218,12 @@ define i16 @andv_v2i16(<2 x i16> %a) { ; ; NONEON-NOSVE-LABEL: andv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %a) ret i16 %res @@ -137,10 +240,17 @@ define i16 @andv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: andv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: and w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a) ret i16 %res @@ -159,11 +269,20 @@ define i16 @andv_v8i16(<8 x i16> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w13, [sp] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: and w12, w13, w12 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: and w10, w12, w10 +; NONEON-NOSVE-NEXT: and w8, w8, w14 +; NONEON-NOSVE-NEXT: and w8, w10, w8 ; NONEON-NOSVE-NEXT: and w0, w8, w9 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret @@ -184,16 +303,40 @@ define i16 @andv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: andv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: and x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #6] +; NONEON-NOSVE-NEXT: and w9, w11, w10 +; NONEON-NOSVE-NEXT: and w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: and w13, w15, w14 +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #8] +; NONEON-NOSVE-NEXT: and w9, w12, w13 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #12] +; NONEON-NOSVE-NEXT: and w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #14] +; NONEON-NOSVE-NEXT: and w10, w14, w10 +; NONEON-NOSVE-NEXT: and w11, w15, w11 +; NONEON-NOSVE-NEXT: and w8, w8, w9 +; NONEON-NOSVE-NEXT: and w9, w10, w11 +; NONEON-NOSVE-NEXT: and w8, w8, w9 +; NONEON-NOSVE-NEXT: and w9, w13, w12 ; NONEON-NOSVE-NEXT: and w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op) @@ -211,9 +354,12 @@ define i32 @andv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: andv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: and w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a) ret i32 %res @@ -232,12 +378,11 @@ define i32 @andv_v4i32(<4 x i32> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: and w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp], #16 +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: and w0, w10, w8 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a) ret i32 %res @@ -256,15 +401,20 @@ define i32 @andv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: andv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w10, w8 +; NONEON-NOSVE-NEXT: and w9, w11, w9 +; NONEON-NOSVE-NEXT: and w8, w9, w8 +; NONEON-NOSVE-NEXT: and w10, w14, w12 +; NONEON-NOSVE-NEXT: and w11, w15, w13 +; NONEON-NOSVE-NEXT: and w9, w10, w11 ; NONEON-NOSVE-NEXT: and w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op) @@ -284,10 +434,8 @@ define i64 @andv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: and x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a) ret i64 %res @@ -306,13 +454,13 @@ define i64 @andv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: andv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: and v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp], #32 +; NONEON-NOSVE-NEXT: and x8, x10, x8 +; NONEON-NOSVE-NEXT: and x9, x11, x9 +; NONEON-NOSVE-NEXT: and x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op) @@ -334,10 +482,17 @@ define i8 @eorv_v4i8(<4 x i8> %a) { ; ; NONEON-NOSVE-LABEL: eorv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: eor w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %a) ret i8 %res @@ -354,11 +509,25 @@ define i8 @eorv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: eorv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: eor w12, w13, w12 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: eor w10, w12, w10 +; NONEON-NOSVE-NEXT: eor w8, w8, w14 +; NONEON-NOSVE-NEXT: eor w8, w10, w8 ; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a) ret i8 %res @@ -377,13 +546,37 @@ define i8 @eorv_v16i8(<16 x i8> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #2] +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: eor w11, w14, w13 +; NONEON-NOSVE-NEXT: eor w9, w12, w9 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #7] +; NONEON-NOSVE-NEXT: eor w10, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: eor w9, w9, w16 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #9] +; NONEON-NOSVE-NEXT: eor w12, w12, w15 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #14] +; NONEON-NOSVE-NEXT: eor w8, w13, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #15] +; NONEON-NOSVE-NEXT: eor w12, w12, w14 +; NONEON-NOSVE-NEXT: eor w8, w8, w11 +; NONEON-NOSVE-NEXT: eor w9, w10, w9 +; NONEON-NOSVE-NEXT: eor w10, w12, w16 +; NONEON-NOSVE-NEXT: eor w8, w8, w15 +; NONEON-NOSVE-NEXT: eor w9, w9, w10 +; NONEON-NOSVE-NEXT: eor w8, w8, w13 +; NONEON-NOSVE-NEXT: eor w0, w9, w8 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a) @@ -403,17 +596,72 @@ define i8 @eorv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: eorv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #2] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #3] +; NONEON-NOSVE-NEXT: eor w9, w11, w10 +; NONEON-NOSVE-NEXT: eor w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: eor w11, w15, w14 +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #4] +; NONEON-NOSVE-NEXT: eor w9, w10, w11 +; NONEON-NOSVE-NEXT: eor w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #6] +; NONEON-NOSVE-NEXT: eor w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: eor w10, w14, w10 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #7] +; NONEON-NOSVE-NEXT: eor w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #24] +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: eor w9, w10, w11 +; NONEON-NOSVE-NEXT: eor w10, w14, w13 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #9] +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: eor w11, w15, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #26] +; NONEON-NOSVE-NEXT: eor w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #10] +; NONEON-NOSVE-NEXT: eor w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #12] +; NONEON-NOSVE-NEXT: eor w9, w9, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #14] +; NONEON-NOSVE-NEXT: eor w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #15] +; NONEON-NOSVE-NEXT: eor w10, w13, w10 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: eor w14, w15, w14 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #30] +; NONEON-NOSVE-NEXT: eor w9, w9, w14 +; NONEON-NOSVE-NEXT: eor w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #31] +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: eor w10, w10, w12 +; NONEON-NOSVE-NEXT: eor w11, w16, w11 +; NONEON-NOSVE-NEXT: eor w10, w10, w11 +; NONEON-NOSVE-NEXT: eor w11, w17, w13 +; NONEON-NOSVE-NEXT: eor w9, w10, w11 ; NONEON-NOSVE-NEXT: eor w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op) @@ -431,9 +679,12 @@ define i16 @eorv_v2i16(<2 x i16> %a) { ; ; NONEON-NOSVE-LABEL: eorv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: eor w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %a) ret i16 %res @@ -450,10 +701,17 @@ define i16 @eorv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: eorv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: eor w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a) ret i16 %res @@ -472,11 +730,20 @@ define i16 @eorv_v8i16(<8 x i16> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w13, [sp] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: eor w12, w13, w12 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: eor w10, w12, w10 +; NONEON-NOSVE-NEXT: eor w8, w8, w14 +; NONEON-NOSVE-NEXT: eor w8, w10, w8 ; NONEON-NOSVE-NEXT: eor w0, w8, w9 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret @@ -497,16 +764,40 @@ define i16 @eorv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: eorv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: eor x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #6] +; NONEON-NOSVE-NEXT: eor w9, w11, w10 +; NONEON-NOSVE-NEXT: eor w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: eor w13, w15, w14 +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #8] +; NONEON-NOSVE-NEXT: eor w9, w12, w13 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #12] +; NONEON-NOSVE-NEXT: eor w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #14] +; NONEON-NOSVE-NEXT: eor w10, w14, w10 +; NONEON-NOSVE-NEXT: eor w11, w15, w11 +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: eor w9, w10, w11 +; NONEON-NOSVE-NEXT: eor w8, w8, w9 +; NONEON-NOSVE-NEXT: eor w9, w13, w12 ; NONEON-NOSVE-NEXT: eor w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op) @@ -524,9 +815,12 @@ define i32 @eorv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: eorv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: eor w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a) ret i32 %res @@ -545,12 +839,11 @@ define i32 @eorv_v4i32(<4 x i32> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: eor w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp], #16 +; NONEON-NOSVE-NEXT: eor w10, w11, w10 +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: eor w0, w10, w8 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a) ret i32 %res @@ -569,15 +862,20 @@ define i32 @eorv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: eorv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #8] +; NONEON-NOSVE-NEXT: eor w8, w10, w8 +; NONEON-NOSVE-NEXT: eor w9, w11, w9 +; NONEON-NOSVE-NEXT: eor w8, w9, w8 +; NONEON-NOSVE-NEXT: eor w10, w14, w12 +; NONEON-NOSVE-NEXT: eor w11, w15, w13 +; NONEON-NOSVE-NEXT: eor w9, w10, w11 ; NONEON-NOSVE-NEXT: eor w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op) @@ -597,10 +895,8 @@ define i64 @eorv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: eor x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a) ret i64 %res @@ -619,13 +915,13 @@ define i64 @eorv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: eorv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: eor v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: eor v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp], #32 +; NONEON-NOSVE-NEXT: eor x8, x10, x8 +; NONEON-NOSVE-NEXT: eor x9, x11, x9 +; NONEON-NOSVE-NEXT: eor x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op) @@ -647,10 +943,17 @@ define i8 @orv_v4i8(<4 x i8> %a) { ; ; NONEON-NOSVE-LABEL: orv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: orr w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %a) ret i8 %res @@ -667,11 +970,25 @@ define i8 @orv_v8i8(<8 x i8> %a) { ; ; NONEON-NOSVE-LABEL: orv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: orr w12, w13, w12 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: orr w10, w12, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w14 +; NONEON-NOSVE-NEXT: orr w8, w10, w8 ; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a) ret i8 %res @@ -690,13 +1007,37 @@ define i8 @orv_v16i8(<16 x i8> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #2] +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: orr w11, w14, w13 +; NONEON-NOSVE-NEXT: orr w9, w12, w9 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #7] +; NONEON-NOSVE-NEXT: orr w10, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #13] +; NONEON-NOSVE-NEXT: orr w9, w9, w16 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #9] +; NONEON-NOSVE-NEXT: orr w12, w12, w15 +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #14] +; NONEON-NOSVE-NEXT: orr w8, w13, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #15] +; NONEON-NOSVE-NEXT: orr w12, w12, w14 +; NONEON-NOSVE-NEXT: orr w8, w8, w11 +; NONEON-NOSVE-NEXT: orr w9, w10, w9 +; NONEON-NOSVE-NEXT: orr w10, w12, w16 +; NONEON-NOSVE-NEXT: orr w8, w8, w15 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w13 +; NONEON-NOSVE-NEXT: orr w0, w9, w8 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a) @@ -716,17 +1057,72 @@ define i8 @orv_v32i8(ptr %a) { ; NONEON-NOSVE-LABEL: orv_v32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #16 -; NONEON-NOSVE-NEXT: lsr x9, x8, #8 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #2] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #3] +; NONEON-NOSVE-NEXT: orr w9, w11, w10 +; NONEON-NOSVE-NEXT: orr w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: orr w11, w15, w14 +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #4] +; NONEON-NOSVE-NEXT: orr w9, w10, w11 +; NONEON-NOSVE-NEXT: orr w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #6] +; NONEON-NOSVE-NEXT: orr w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #8] +; NONEON-NOSVE-NEXT: orr w10, w14, w10 +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #7] +; NONEON-NOSVE-NEXT: orr w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #24] +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: orr w9, w10, w11 +; NONEON-NOSVE-NEXT: orr w10, w14, w13 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #9] +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: orr w11, w15, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #26] +; NONEON-NOSVE-NEXT: orr w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #10] +; NONEON-NOSVE-NEXT: orr w10, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #12] +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w16, [sp, #14] +; NONEON-NOSVE-NEXT: orr w11, w12, w11 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w17, [sp, #15] +; NONEON-NOSVE-NEXT: orr w10, w13, w10 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #13] +; NONEON-NOSVE-NEXT: orr w14, w15, w14 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #30] +; NONEON-NOSVE-NEXT: orr w9, w9, w14 +; NONEON-NOSVE-NEXT: orr w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #31] +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: orr w10, w10, w12 +; NONEON-NOSVE-NEXT: orr w11, w16, w11 +; NONEON-NOSVE-NEXT: orr w10, w10, w11 +; NONEON-NOSVE-NEXT: orr w11, w17, w13 +; NONEON-NOSVE-NEXT: orr w9, w10, w11 ; NONEON-NOSVE-NEXT: orr w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op) @@ -744,9 +1140,12 @@ define i16 @orv_v2i16(<2 x i16> %a) { ; ; NONEON-NOSVE-LABEL: orv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %a) ret i16 %res @@ -763,10 +1162,17 @@ define i16 @orv_v4i16(<4 x i16> %a) { ; ; NONEON-NOSVE-LABEL: orv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: orr w0, w10, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a) ret i16 %res @@ -785,11 +1191,20 @@ define i16 @orv_v8i16(<8 x i16> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w13, [sp] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: orr w12, w13, w12 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: orr w10, w12, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w14 +; NONEON-NOSVE-NEXT: orr w8, w10, w8 ; NONEON-NOSVE-NEXT: orr w0, w8, w9 ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret @@ -810,16 +1225,40 @@ define i16 @orv_v16i16(ptr %a) { ; NONEON-NOSVE-LABEL: orv_v16i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: orr x8, x8, x8, lsr #32 -; NONEON-NOSVE-NEXT: lsr x9, x8, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #4] +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #6] +; NONEON-NOSVE-NEXT: orr w9, w11, w10 +; NONEON-NOSVE-NEXT: orr w12, w13, w12 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: orr w13, w15, w14 +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #8] +; NONEON-NOSVE-NEXT: orr w9, w12, w13 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #12] +; NONEON-NOSVE-NEXT: orr w14, w17, w16 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #14] +; NONEON-NOSVE-NEXT: orr w10, w14, w10 +; NONEON-NOSVE-NEXT: orr w11, w15, w11 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: orr w9, w10, w11 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: orr w9, w13, w12 ; NONEON-NOSVE-NEXT: orr w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op) @@ -837,9 +1276,12 @@ define i32 @orv_v2i32(<2 x i32> %a) { ; ; NONEON-NOSVE-LABEL: orv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: orr w0, w9, w8 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a) ret i32 %res @@ -858,12 +1300,11 @@ define i32 @orv_v4i32(<4 x i32> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 -; NONEON-NOSVE-NEXT: orr w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp], #16 +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: orr w0, w10, w8 ; NONEON-NOSVE-NEXT: ret %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a) ret i32 %res @@ -882,15 +1323,20 @@ define i32 @orv_v8i32(ptr %a) { ; NONEON-NOSVE-LABEL: orv_v8i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x8, d0 -; NONEON-NOSVE-NEXT: lsr x9, x8, #32 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w11, w10, [sp] +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #8] +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: orr w9, w11, w9 +; NONEON-NOSVE-NEXT: orr w8, w9, w8 +; NONEON-NOSVE-NEXT: orr w10, w14, w12 +; NONEON-NOSVE-NEXT: orr w11, w15, w13 +; NONEON-NOSVE-NEXT: orr w9, w10, w11 ; NONEON-NOSVE-NEXT: orr w0, w8, w9 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op) @@ -910,10 +1356,8 @@ define i64 @orv_v2i64(<2 x i64> %a) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp], #16 +; NONEON-NOSVE-NEXT: orr x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a) ret i64 %res @@ -932,13 +1376,13 @@ define i64 @orv_v4i64(ptr %a) { ; NONEON-NOSVE-LABEL: orv_v4i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: orr v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr d1, [sp, #8] -; NONEON-NOSVE-NEXT: orr v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: fmov x0, d0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp], #32 +; NONEON-NOSVE-NEXT: orr x8, x10, x8 +; NONEON-NOSVE-NEXT: orr x9, x11, x9 +; NONEON-NOSVE-NEXT: orr x0, x9, x8 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll index 6c33613f8e757d..be335c697707de 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-load.ll @@ -23,40 +23,83 @@ define <4 x i8> @masked_load_v4i8(ptr %src, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI0_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: str d0, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #116] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #118] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #112] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 ; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB0_2 ; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[0], [x0] +; NONEON-NOSVE-NEXT: ldrb w9, [x0] +; NONEON-NOSVE-NEXT: strh wzr, [sp, #110] +; NONEON-NOSVE-NEXT: stur wzr, [sp, #106] +; NONEON-NOSVE-NEXT: strh w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #104] ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB0_3 ; NONEON-NOSVE-NEXT: b .LBB0_4 ; NONEON-NOSVE-NEXT: .LBB0_2: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI0_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI0_0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB0_4 ; NONEON-NOSVE-NEXT: .LBB0_3: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #1 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #1] +; NONEON-NOSVE-NEXT: str d0, [sp, #80] +; NONEON-NOSVE-NEXT: strh w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #84] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #96] +; NONEON-NOSVE-NEXT: str w9, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #80] +; NONEON-NOSVE-NEXT: str d0, [sp, #72] +; NONEON-NOSVE-NEXT: strh w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #72] +; NONEON-NOSVE-NEXT: strh w9, [sp, #90] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] ; NONEON-NOSVE-NEXT: .LBB0_4: // %else2 ; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB0_7 ; NONEON-NOSVE-NEXT: // %bb.5: // %else5 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB0_8 ; NONEON-NOSVE-NEXT: .LBB0_6: // %else8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB0_7: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: str d0, [sp, #40] +; NONEON-NOSVE-NEXT: str w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB0_6 ; NONEON-NOSVE-NEXT: .LBB0_8: // %cond.load7 -; NONEON-NOSVE-NEXT: add x8, x0, #3 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #3] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %load = call <4 x i8> @llvm.masked.load.v4i8(ptr %src, i32 8, <4 x i1> %mask, <4 x i8> zeroinitializer) ret <4 x i8> %load @@ -76,64 +119,183 @@ define <8 x i8> @masked_load_v8i8(ptr %src, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w8, s0 -; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB1_2 +; NONEON-NOSVE-NEXT: sub sp, sp, #272 +; NONEON-NOSVE-NEXT: str x29, [sp, #256] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 272 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str d0, [sp, #240] +; NONEON-NOSVE-NEXT: add x9, sp, #176 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #242] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #243] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #241] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #244] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #245] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #246] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #240] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #247] +; NONEON-NOSVE-NEXT: and w11, w11, #0x2 +; NONEON-NOSVE-NEXT: and w13, w13, #0x10 +; NONEON-NOSVE-NEXT: bfxil w11, w12, #0, #1 +; NONEON-NOSVE-NEXT: and w12, w14, #0x20 +; NONEON-NOSVE-NEXT: orr w8, w8, w13 +; NONEON-NOSVE-NEXT: and w13, w15, #0x40 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w11, w8 +; NONEON-NOSVE-NEXT: orr w11, w12, w13 +; NONEON-NOSVE-NEXT: orr w8, w8, w11 +; NONEON-NOSVE-NEXT: and w10, w10, #0x80 +; NONEON-NOSVE-NEXT: add w10, w8, w10 +; NONEON-NOSVE-NEXT: and w8, w10, #0xff +; NONEON-NOSVE-NEXT: tbz w10, #0, .LBB1_2 ; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load -; NONEON-NOSVE-NEXT: ldr b0, [x0] +; NONEON-NOSVE-NEXT: ldrb w10, [x0] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #239] +; NONEON-NOSVE-NEXT: sturh wzr, [x9, #61] +; NONEON-NOSVE-NEXT: stur wzr, [x9, #57] +; NONEON-NOSVE-NEXT: strb w10, [sp, #232] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #232] ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB1_3 ; NONEON-NOSVE-NEXT: b .LBB1_4 ; NONEON-NOSVE-NEXT: .LBB1_2: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x10, .LCPI1_0 +; NONEON-NOSVE-NEXT: ldr d0, [x10, :lo12:.LCPI1_0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB1_4 ; NONEON-NOSVE-NEXT: .LBB1_3: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #1 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #1] +; NONEON-NOSVE-NEXT: str d0, [sp, #208] +; NONEON-NOSVE-NEXT: strb w10, [sp, #224] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #214] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #224] +; NONEON-NOSVE-NEXT: strh w10, [sp, #222] +; NONEON-NOSVE-NEXT: str d0, [sp, #200] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #34] +; NONEON-NOSVE-NEXT: stur w10, [x9, #42] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #208] +; NONEON-NOSVE-NEXT: strb w10, [sp, #216] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #200] +; NONEON-NOSVE-NEXT: strb w10, [sp, #217] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #216] ; NONEON-NOSVE-NEXT: .LBB1_4: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB1_11 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB1_12 ; NONEON-NOSVE-NEXT: // %bb.5: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB1_12 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB1_13 ; NONEON-NOSVE-NEXT: .LBB1_6: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB1_13 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB1_14 ; NONEON-NOSVE-NEXT: .LBB1_7: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB1_14 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB1_15 ; NONEON-NOSVE-NEXT: .LBB1_8: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB1_15 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB1_16 ; NONEON-NOSVE-NEXT: .LBB1_9: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB1_16 -; NONEON-NOSVE-NEXT: .LBB1_10: // %else20 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB1_11 +; NONEON-NOSVE-NEXT: .LBB1_10: // %cond.load19 +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #7] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: .LBB1_11: // %else20 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #256] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #272 ; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB1_11: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] +; NONEON-NOSVE-NEXT: .LBB1_12: // %cond.load4 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #176] +; NONEON-NOSVE-NEXT: strb w10, [sp, #192] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #183] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #192] +; NONEON-NOSVE-NEXT: strb w10, [sp, #191] +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #3] +; NONEON-NOSVE-NEXT: stur w10, [x9, #11] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #176] +; NONEON-NOSVE-NEXT: strh w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #168] +; NONEON-NOSVE-NEXT: strb w9, [sp, #186] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #184] ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB1_6 -; NONEON-NOSVE-NEXT: .LBB1_12: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #3 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] +; NONEON-NOSVE-NEXT: .LBB1_13: // %cond.load7 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #3] +; NONEON-NOSVE-NEXT: str d0, [sp, #144] +; NONEON-NOSVE-NEXT: strb w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #148] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #160] +; NONEON-NOSVE-NEXT: str w9, [sp, #156] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #146] +; NONEON-NOSVE-NEXT: str d0, [sp, #136] +; NONEON-NOSVE-NEXT: strb w9, [sp, #154] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #144] +; NONEON-NOSVE-NEXT: strh w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #136] +; NONEON-NOSVE-NEXT: strb w9, [sp, #155] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #152] ; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB1_7 -; NONEON-NOSVE-NEXT: .LBB1_13: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] +; NONEON-NOSVE-NEXT: .LBB1_14: // %cond.load10 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #112] +; NONEON-NOSVE-NEXT: strb w9, [sp, #128] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #119] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #128] +; NONEON-NOSVE-NEXT: strb w9, [sp, #127] +; NONEON-NOSVE-NEXT: ldurh w9, [sp, #117] +; NONEON-NOSVE-NEXT: str d0, [sp, #104] +; NONEON-NOSVE-NEXT: sturh w9, [sp, #125] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #112] +; NONEON-NOSVE-NEXT: str w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #104] +; NONEON-NOSVE-NEXT: strb w9, [sp, #124] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #120] ; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB1_8 -; NONEON-NOSVE-NEXT: .LBB1_14: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #5 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] +; NONEON-NOSVE-NEXT: .LBB1_15: // %cond.load13 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #5] +; NONEON-NOSVE-NEXT: str d0, [sp, #80] +; NONEON-NOSVE-NEXT: strb w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #86] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #84] +; NONEON-NOSVE-NEXT: str d0, [sp, #72] +; NONEON-NOSVE-NEXT: strb w9, [sp, #92] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] +; NONEON-NOSVE-NEXT: str w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #72] +; NONEON-NOSVE-NEXT: strb w9, [sp, #93] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] ; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB1_9 -; NONEON-NOSVE-NEXT: .LBB1_15: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB1_10 -; NONEON-NOSVE-NEXT: .LBB1_16: // %cond.load19 -; NONEON-NOSVE-NEXT: add x8, x0, #7 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB1_16: // %cond.load16 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #6] +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: strb w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #55] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: strb w9, [sp, #63] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: str d0, [sp, #40] +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: str w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w9, [sp, #62] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB1_10 +; NONEON-NOSVE-NEXT: b .LBB1_11 %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %src, i32 8, <8 x i1> %mask, <8 x i8> zeroinitializer) ret <8 x i8> %load } @@ -152,112 +314,413 @@ define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 -; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #1024 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1040 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str q0, [sp, #976] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #984] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1000] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #976] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #992] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #991] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1007] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #990] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1006] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #989] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1005] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #988] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1004] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #987] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1003] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #986] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1002] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #985] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1001] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #983] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #999] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #982] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #998] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #981] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #997] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #980] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #996] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #979] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #995] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #978] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #994] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #977] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #993] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #992] ; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: addv h1, v0.8h -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB2_17 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_18 -; NONEON-NOSVE-NEXT: .LBB2_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB2_19 -; NONEON-NOSVE-NEXT: .LBB2_3: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB2_20 -; NONEON-NOSVE-NEXT: .LBB2_4: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB2_21 -; NONEON-NOSVE-NEXT: .LBB2_5: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB2_22 -; NONEON-NOSVE-NEXT: .LBB2_6: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB2_23 -; NONEON-NOSVE-NEXT: .LBB2_7: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB2_24 -; NONEON-NOSVE-NEXT: .LBB2_8: // %else20 -; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB2_25 -; NONEON-NOSVE-NEXT: .LBB2_9: // %else23 -; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB2_26 -; NONEON-NOSVE-NEXT: .LBB2_10: // %else26 -; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB2_27 -; NONEON-NOSVE-NEXT: .LBB2_11: // %else29 -; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB2_28 -; NONEON-NOSVE-NEXT: .LBB2_12: // %else32 -; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB2_29 -; NONEON-NOSVE-NEXT: .LBB2_13: // %else35 -; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB2_30 -; NONEON-NOSVE-NEXT: .LBB2_14: // %else38 -; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB2_31 -; NONEON-NOSVE-NEXT: .LBB2_15: // %else41 -; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_32 -; NONEON-NOSVE-NEXT: .LBB2_16: // %else44 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB2_17: // %cond.load -; NONEON-NOSVE-NEXT: ldr b0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB2_2 -; NONEON-NOSVE-NEXT: .LBB2_18: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #1 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB2_3 -; NONEON-NOSVE-NEXT: .LBB2_19: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB2_4 -; NONEON-NOSVE-NEXT: .LBB2_20: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #3 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB2_5 -; NONEON-NOSVE-NEXT: .LBB2_21: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB2_6 -; NONEON-NOSVE-NEXT: .LBB2_22: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #5 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB2_7 -; NONEON-NOSVE-NEXT: .LBB2_23: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB2_8 -; NONEON-NOSVE-NEXT: .LBB2_24: // %cond.load19 -; NONEON-NOSVE-NEXT: add x9, x0, #7 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB2_9 -; NONEON-NOSVE-NEXT: .LBB2_25: // %cond.load22 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[8], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB2_10 -; NONEON-NOSVE-NEXT: .LBB2_26: // %cond.load25 -; NONEON-NOSVE-NEXT: add x9, x0, #9 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[9], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB2_11 -; NONEON-NOSVE-NEXT: .LBB2_27: // %cond.load28 -; NONEON-NOSVE-NEXT: add x9, x0, #10 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[10], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB2_12 -; NONEON-NOSVE-NEXT: .LBB2_28: // %cond.load31 -; NONEON-NOSVE-NEXT: add x9, x0, #11 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[11], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB2_13 -; NONEON-NOSVE-NEXT: .LBB2_29: // %cond.load34 -; NONEON-NOSVE-NEXT: add x9, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[12], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB2_14 -; NONEON-NOSVE-NEXT: .LBB2_30: // %cond.load37 -; NONEON-NOSVE-NEXT: add x9, x0, #13 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[13], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB2_15 -; NONEON-NOSVE-NEXT: .LBB2_31: // %cond.load40 -; NONEON-NOSVE-NEXT: add x9, x0, #14 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[14], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_16 -; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.load43 -; NONEON-NOSVE-NEXT: add x8, x0, #15 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[15], [x8] +; NONEON-NOSVE-NEXT: str q0, [sp, #1008] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #1010] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #1008] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1012] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1014] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #1016] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #1018] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #1020] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w12, w13 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w14 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1022] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add x9, sp, #720 +; NONEON-NOSVE-NEXT: add w8, w8, w10 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB2_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldrb w10, [x0] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #975] +; NONEON-NOSVE-NEXT: sturh wzr, [x9, #253] +; NONEON-NOSVE-NEXT: stur wzr, [x9, #249] +; NONEON-NOSVE-NEXT: stur xzr, [x9, #241] +; NONEON-NOSVE-NEXT: strb w10, [sp, #960] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #960] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_3 +; NONEON-NOSVE-NEXT: b .LBB2_4 +; NONEON-NOSVE-NEXT: .LBB2_2: +; NONEON-NOSVE-NEXT: adrp x10, .LCPI2_0 +; NONEON-NOSVE-NEXT: ldr q0, [x10, :lo12:.LCPI2_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB2_4 +; NONEON-NOSVE-NEXT: .LBB2_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #1] +; NONEON-NOSVE-NEXT: str q0, [sp, #912] +; NONEON-NOSVE-NEXT: strb w10, [sp, #944] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #926] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #944] +; NONEON-NOSVE-NEXT: strh w10, [sp, #942] +; NONEON-NOSVE-NEXT: str q0, [sp, #896] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #194] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: stur x11, [x9, #210] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #912] +; NONEON-NOSVE-NEXT: strb w10, [sp, #928] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #896] +; NONEON-NOSVE-NEXT: strb w10, [sp, #929] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #928] +; NONEON-NOSVE-NEXT: .LBB2_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB2_20 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB2_21 +; NONEON-NOSVE-NEXT: .LBB2_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB2_22 +; NONEON-NOSVE-NEXT: .LBB2_7: // %else11 +; NONEON-NOSVE-NEXT: add x9, sp, #464 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB2_23 +; NONEON-NOSVE-NEXT: .LBB2_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB2_24 +; NONEON-NOSVE-NEXT: .LBB2_9: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB2_25 +; NONEON-NOSVE-NEXT: .LBB2_10: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB2_26 +; NONEON-NOSVE-NEXT: .LBB2_11: // %else23 +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB2_27 +; NONEON-NOSVE-NEXT: .LBB2_12: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB2_28 +; NONEON-NOSVE-NEXT: .LBB2_13: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB2_29 +; NONEON-NOSVE-NEXT: .LBB2_14: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB2_30 +; NONEON-NOSVE-NEXT: .LBB2_15: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB2_31 +; NONEON-NOSVE-NEXT: .LBB2_16: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB2_32 +; NONEON-NOSVE-NEXT: .LBB2_17: // %else41 +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_19 +; NONEON-NOSVE-NEXT: .LBB2_18: // %cond.load43 +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #15] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB2_19: // %else44 +; NONEON-NOSVE-NEXT: add sp, sp, #1024 +; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB2_20: // %cond.load4 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #2] +; NONEON-NOSVE-NEXT: str q0, [sp, #848] +; NONEON-NOSVE-NEXT: strb w10, [sp, #880] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #863] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #880] +; NONEON-NOSVE-NEXT: strb w10, [sp, #879] +; NONEON-NOSVE-NEXT: str q0, [sp, #832] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #131] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: stur x11, [x9, #147] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #848] +; NONEON-NOSVE-NEXT: strh w10, [sp, #864] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #832] +; NONEON-NOSVE-NEXT: strb w10, [sp, #866] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #864] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB2_6 +; NONEON-NOSVE-NEXT: .LBB2_21: // %cond.load7 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #3] +; NONEON-NOSVE-NEXT: str q0, [sp, #784] +; NONEON-NOSVE-NEXT: strb w10, [sp, #816] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #796] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #816] +; NONEON-NOSVE-NEXT: str w10, [sp, #812] +; NONEON-NOSVE-NEXT: str q0, [sp, #768] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #68] +; NONEON-NOSVE-NEXT: stur x10, [x9, #84] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #786] +; NONEON-NOSVE-NEXT: strb w10, [sp, #802] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #784] +; NONEON-NOSVE-NEXT: strh w10, [sp, #800] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #768] +; NONEON-NOSVE-NEXT: strb w10, [sp, #803] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #800] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB2_7 +; NONEON-NOSVE-NEXT: .LBB2_22: // %cond.load10 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #720] +; NONEON-NOSVE-NEXT: strb w10, [sp, #752] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #735] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #752] +; NONEON-NOSVE-NEXT: strb w10, [sp, #751] +; NONEON-NOSVE-NEXT: str q0, [sp, #704] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #5] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur x11, [x9, #21] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #720] +; NONEON-NOSVE-NEXT: str w9, [sp, #736] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #704] +; NONEON-NOSVE-NEXT: strb w9, [sp, #740] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #736] +; NONEON-NOSVE-NEXT: add x9, sp, #464 +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB2_8 +; NONEON-NOSVE-NEXT: .LBB2_23: // %cond.load13 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #5] +; NONEON-NOSVE-NEXT: str q0, [sp, #656] +; NONEON-NOSVE-NEXT: strb w10, [sp, #688] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #670] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #688] +; NONEON-NOSVE-NEXT: strh w10, [sp, #686] +; NONEON-NOSVE-NEXT: str q0, [sp, #640] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #198] +; NONEON-NOSVE-NEXT: stur x10, [x9, #214] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #660] +; NONEON-NOSVE-NEXT: strb w10, [sp, #676] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #656] +; NONEON-NOSVE-NEXT: str w10, [sp, #672] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #640] +; NONEON-NOSVE-NEXT: strb w10, [sp, #677] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #672] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB2_9 +; NONEON-NOSVE-NEXT: .LBB2_24: // %cond.load16 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #6] +; NONEON-NOSVE-NEXT: str q0, [sp, #592] +; NONEON-NOSVE-NEXT: strb w10, [sp, #624] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #607] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #624] +; NONEON-NOSVE-NEXT: strb w10, [sp, #623] +; NONEON-NOSVE-NEXT: str q0, [sp, #576] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #135] +; NONEON-NOSVE-NEXT: stur x10, [x9, #151] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #596] +; NONEON-NOSVE-NEXT: strh w10, [sp, #612] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #592] +; NONEON-NOSVE-NEXT: str w10, [sp, #608] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #576] +; NONEON-NOSVE-NEXT: strb w10, [sp, #614] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #608] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB2_10 +; NONEON-NOSVE-NEXT: .LBB2_25: // %cond.load19 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #7] +; NONEON-NOSVE-NEXT: str q0, [sp, #528] +; NONEON-NOSVE-NEXT: strb w10, [sp, #560] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #536] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #560] +; NONEON-NOSVE-NEXT: str x10, [sp, #552] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #534] +; NONEON-NOSVE-NEXT: str q0, [sp, #512] +; NONEON-NOSVE-NEXT: strb w10, [sp, #550] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #532] +; NONEON-NOSVE-NEXT: strh w10, [sp, #548] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #528] +; NONEON-NOSVE-NEXT: str w10, [sp, #544] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #512] +; NONEON-NOSVE-NEXT: strb w10, [sp, #551] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #544] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB2_11 +; NONEON-NOSVE-NEXT: .LBB2_26: // %cond.load22 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #464] +; NONEON-NOSVE-NEXT: strb w10, [sp, #496] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #479] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #496] +; NONEON-NOSVE-NEXT: strb w10, [sp, #495] +; NONEON-NOSVE-NEXT: str q0, [sp, #448] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur w11, [x9, #9] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur w11, [x9, #25] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #464] +; NONEON-NOSVE-NEXT: str x9, [sp, #480] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #448] +; NONEON-NOSVE-NEXT: strb w9, [sp, #488] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #480] +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB2_12 +; NONEON-NOSVE-NEXT: .LBB2_27: // %cond.load25 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #9] +; NONEON-NOSVE-NEXT: str q0, [sp, #400] +; NONEON-NOSVE-NEXT: strb w10, [sp, #432] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #414] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #432] +; NONEON-NOSVE-NEXT: strh w10, [sp, #430] +; NONEON-NOSVE-NEXT: str q0, [sp, #384] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #408] +; NONEON-NOSVE-NEXT: strb w10, [sp, #424] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #400] +; NONEON-NOSVE-NEXT: str x10, [sp, #416] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #384] +; NONEON-NOSVE-NEXT: strb w10, [sp, #425] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #416] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB2_13 +; NONEON-NOSVE-NEXT: .LBB2_28: // %cond.load28 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #10] +; NONEON-NOSVE-NEXT: str q0, [sp, #336] +; NONEON-NOSVE-NEXT: strb w10, [sp, #368] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #351] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #368] +; NONEON-NOSVE-NEXT: strb w10, [sp, #367] +; NONEON-NOSVE-NEXT: str q0, [sp, #320] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #344] +; NONEON-NOSVE-NEXT: strh w10, [sp, #360] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #336] +; NONEON-NOSVE-NEXT: str x10, [sp, #352] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #320] +; NONEON-NOSVE-NEXT: strb w10, [sp, #362] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #352] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB2_14 +; NONEON-NOSVE-NEXT: .LBB2_29: // %cond.load31 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #11] +; NONEON-NOSVE-NEXT: str q0, [sp, #272] +; NONEON-NOSVE-NEXT: strb w10, [sp, #304] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #284] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #304] +; NONEON-NOSVE-NEXT: str w10, [sp, #300] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #282] +; NONEON-NOSVE-NEXT: str q0, [sp, #256] +; NONEON-NOSVE-NEXT: strb w10, [sp, #298] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #280] +; NONEON-NOSVE-NEXT: strh w10, [sp, #296] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #272] +; NONEON-NOSVE-NEXT: str x10, [sp, #288] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #256] +; NONEON-NOSVE-NEXT: strb w10, [sp, #299] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #288] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB2_15 +; NONEON-NOSVE-NEXT: .LBB2_30: // %cond.load34 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #208] +; NONEON-NOSVE-NEXT: strb w10, [sp, #240] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #223] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #240] +; NONEON-NOSVE-NEXT: strb w10, [sp, #239] +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #216] +; NONEON-NOSVE-NEXT: str w9, [sp, #232] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #208] +; NONEON-NOSVE-NEXT: str x9, [sp, #224] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #192] +; NONEON-NOSVE-NEXT: strb w9, [sp, #236] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #224] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB2_16 +; NONEON-NOSVE-NEXT: .LBB2_31: // %cond.load37 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #13] +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: strb w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #158] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #176] +; NONEON-NOSVE-NEXT: strh w9, [sp, #174] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #156] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: strb w9, [sp, #172] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #152] +; NONEON-NOSVE-NEXT: str w9, [sp, #168] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #144] +; NONEON-NOSVE-NEXT: str x9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #128] +; NONEON-NOSVE-NEXT: strb w9, [sp, #173] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #160] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB2_17 +; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.load40 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #14] +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: strb w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #95] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #112] +; NONEON-NOSVE-NEXT: strb w9, [sp, #111] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #92] +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #108] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #88] +; NONEON-NOSVE-NEXT: str w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #80] +; NONEON-NOSVE-NEXT: str x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #64] +; NONEON-NOSVE-NEXT: strb w9, [sp, #110] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #96] +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_18 +; NONEON-NOSVE-NEXT: b .LBB2_19 %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %src, i32 8, <16 x i1> %mask, <16 x i8> zeroinitializer) ret <16 x i8> %load } @@ -342,274 +805,815 @@ define <32 x i8> @masked_load_v32i8(ptr %src, <32 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] -; NONEON-NOSVE-NEXT: fmov s1, w1 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] -; NONEON-NOSVE-NEXT: mov v1.b[1], w2 -; NONEON-NOSVE-NEXT: mov v0.b[1], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp] -; NONEON-NOSVE-NEXT: mov v1.b[2], w3 -; NONEON-NOSVE-NEXT: mov v0.b[2], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] -; NONEON-NOSVE-NEXT: mov v1.b[3], w4 -; NONEON-NOSVE-NEXT: mov v0.b[3], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] -; NONEON-NOSVE-NEXT: mov v1.b[4], w5 -; NONEON-NOSVE-NEXT: mov v0.b[4], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] -; NONEON-NOSVE-NEXT: mov v1.b[5], w6 -; NONEON-NOSVE-NEXT: mov v0.b[5], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] -; NONEON-NOSVE-NEXT: mov v1.b[6], w7 -; NONEON-NOSVE-NEXT: mov v0.b[6], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] -; NONEON-NOSVE-NEXT: mov v1.b[7], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] -; NONEON-NOSVE-NEXT: mov v0.b[7], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] -; NONEON-NOSVE-NEXT: mov v1.b[8], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] -; NONEON-NOSVE-NEXT: mov v0.b[8], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] -; NONEON-NOSVE-NEXT: mov v1.b[9], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] -; NONEON-NOSVE-NEXT: mov v0.b[9], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] -; NONEON-NOSVE-NEXT: mov v1.b[10], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] -; NONEON-NOSVE-NEXT: mov v0.b[10], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] -; NONEON-NOSVE-NEXT: mov v1.b[11], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] -; NONEON-NOSVE-NEXT: mov v0.b[11], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #168] -; NONEON-NOSVE-NEXT: mov v1.b[12], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] -; NONEON-NOSVE-NEXT: mov v0.b[12], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] -; NONEON-NOSVE-NEXT: mov v1.b[13], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #56] -; NONEON-NOSVE-NEXT: mov v0.b[13], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] -; NONEON-NOSVE-NEXT: mov v1.b[14], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #64] -; NONEON-NOSVE-NEXT: mov v0.b[14], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] -; NONEON-NOSVE-NEXT: mov v1.b[15], w9 -; NONEON-NOSVE-NEXT: mov v0.b[15], w8 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 -; NONEON-NOSVE-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] -; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; NONEON-NOSVE-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; NONEON-NOSVE-NEXT: zip1 v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: addv h1, v1.8h -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: bfi w8, w9, #16, #16 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_33 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_34 -; NONEON-NOSVE-NEXT: .LBB3_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_35 -; NONEON-NOSVE-NEXT: .LBB3_3: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_36 -; NONEON-NOSVE-NEXT: .LBB3_4: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_37 -; NONEON-NOSVE-NEXT: .LBB3_5: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_38 -; NONEON-NOSVE-NEXT: .LBB3_6: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_39 -; NONEON-NOSVE-NEXT: .LBB3_7: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_40 -; NONEON-NOSVE-NEXT: .LBB3_8: // %else20 -; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_41 -; NONEON-NOSVE-NEXT: .LBB3_9: // %else23 -; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_42 -; NONEON-NOSVE-NEXT: .LBB3_10: // %else26 -; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_43 -; NONEON-NOSVE-NEXT: .LBB3_11: // %else29 -; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_44 -; NONEON-NOSVE-NEXT: .LBB3_12: // %else32 -; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_45 -; NONEON-NOSVE-NEXT: .LBB3_13: // %else35 -; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_46 -; NONEON-NOSVE-NEXT: .LBB3_14: // %else38 -; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_47 -; NONEON-NOSVE-NEXT: .LBB3_15: // %else41 -; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_48 -; NONEON-NOSVE-NEXT: .LBB3_16: // %else44 -; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_49 -; NONEON-NOSVE-NEXT: .LBB3_17: // %else47 -; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_50 -; NONEON-NOSVE-NEXT: .LBB3_18: // %else50 -; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_51 -; NONEON-NOSVE-NEXT: .LBB3_19: // %else53 -; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_52 -; NONEON-NOSVE-NEXT: .LBB3_20: // %else56 -; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_53 -; NONEON-NOSVE-NEXT: .LBB3_21: // %else59 -; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_54 -; NONEON-NOSVE-NEXT: .LBB3_22: // %else62 -; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_55 -; NONEON-NOSVE-NEXT: .LBB3_23: // %else65 -; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_56 -; NONEON-NOSVE-NEXT: .LBB3_24: // %else68 -; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_57 -; NONEON-NOSVE-NEXT: .LBB3_25: // %else71 -; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_58 -; NONEON-NOSVE-NEXT: .LBB3_26: // %else74 -; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_59 -; NONEON-NOSVE-NEXT: .LBB3_27: // %else77 -; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_60 -; NONEON-NOSVE-NEXT: .LBB3_28: // %else80 -; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_61 -; NONEON-NOSVE-NEXT: .LBB3_29: // %else83 -; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_62 -; NONEON-NOSVE-NEXT: .LBB3_30: // %else86 -; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_63 -; NONEON-NOSVE-NEXT: .LBB3_31: // %else89 -; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_64 -; NONEON-NOSVE-NEXT: .LBB3_32: // %else92 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB3_33: // %cond.load -; NONEON-NOSVE-NEXT: ldr b0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_2 -; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #1 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB3_3 -; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_4 -; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #3 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_5 -; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_6 -; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #5 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_7 -; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_8 -; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.load19 -; NONEON-NOSVE-NEXT: add x9, x0, #7 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[7], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_9 -; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.load22 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[8], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_10 -; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.load25 -; NONEON-NOSVE-NEXT: add x9, x0, #9 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[9], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_11 -; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.load28 -; NONEON-NOSVE-NEXT: add x9, x0, #10 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[10], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_12 -; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.load31 -; NONEON-NOSVE-NEXT: add x9, x0, #11 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[11], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_13 -; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.load34 -; NONEON-NOSVE-NEXT: add x9, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[12], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_14 -; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.load37 -; NONEON-NOSVE-NEXT: add x9, x0, #13 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[13], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_15 -; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.load40 -; NONEON-NOSVE-NEXT: add x9, x0, #14 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[14], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_16 -; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.load43 -; NONEON-NOSVE-NEXT: add x9, x0, #15 -; NONEON-NOSVE-NEXT: ld1 { v0.b }[15], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_17 -; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.load46 -; NONEON-NOSVE-NEXT: add x9, x0, #16 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[0], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_18 -; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.load49 -; NONEON-NOSVE-NEXT: add x9, x0, #17 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_19 -; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.load52 -; NONEON-NOSVE-NEXT: add x9, x0, #18 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_20 -; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.load55 -; NONEON-NOSVE-NEXT: add x9, x0, #19 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_21 -; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.load58 -; NONEON-NOSVE-NEXT: add x9, x0, #20 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_22 -; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.load61 -; NONEON-NOSVE-NEXT: add x9, x0, #21 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_23 -; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.load64 -; NONEON-NOSVE-NEXT: add x9, x0, #22 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_24 -; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.load67 -; NONEON-NOSVE-NEXT: add x9, x0, #23 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[7], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_25 -; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.load70 -; NONEON-NOSVE-NEXT: add x9, x0, #24 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[8], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_26 -; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.load73 -; NONEON-NOSVE-NEXT: add x9, x0, #25 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[9], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_27 -; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.load76 -; NONEON-NOSVE-NEXT: add x9, x0, #26 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[10], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_28 -; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.load79 -; NONEON-NOSVE-NEXT: add x9, x0, #27 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[11], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_29 -; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.load82 -; NONEON-NOSVE-NEXT: add x9, x0, #28 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[12], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_30 -; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.load85 -; NONEON-NOSVE-NEXT: add x9, x0, #29 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[13], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_31 -; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.load88 -; NONEON-NOSVE-NEXT: add x9, x0, #30 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[14], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_32 -; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.load91 -; NONEON-NOSVE-NEXT: add x8, x0, #31 -; NONEON-NOSVE-NEXT: ld1 { v1.b }[15], [x8] +; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #2064 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 2080 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #2216] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #2152] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2272] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #2176] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #2160] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2024] +; NONEON-NOSVE-NEXT: and w8, w9, #0x1 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2264] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2016] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2256] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2031] +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2248] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2030] +; NONEON-NOSVE-NEXT: and w8, w9, #0x20 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2240] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2029] +; NONEON-NOSVE-NEXT: and w8, w9, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2232] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2028] +; NONEON-NOSVE-NEXT: and w8, w9, #0x8 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2224] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2027] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2208] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2026] +; NONEON-NOSVE-NEXT: and w8, w9, #0x2 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2200] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2025] +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2192] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2023] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #2184] +; NONEON-NOSVE-NEXT: and w9, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: strb w9, [sp, #2022] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #2168] +; NONEON-NOSVE-NEXT: and w10, w10, #0x20 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w10, [sp, #2021] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2020] +; NONEON-NOSVE-NEXT: and w8, w11, #0x8 +; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2019] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #2088] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2018] +; NONEON-NOSVE-NEXT: and w8, w10, #0x2 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2136] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2017] +; NONEON-NOSVE-NEXT: and w8, w9, #0x1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #2144] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2008] +; NONEON-NOSVE-NEXT: and w8, w1, #0x1 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #2104] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2000] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #2080] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldr q0, [sp, #2016] +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2128] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2015] +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #2120] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2014] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #2112] +; NONEON-NOSVE-NEXT: and w9, w9, #0x20 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: strb w9, [sp, #2013] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #2096] +; NONEON-NOSVE-NEXT: and w10, w10, #0x10 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w10, [sp, #2012] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2011] +; NONEON-NOSVE-NEXT: and w8, w11, #0x4 +; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2010] +; NONEON-NOSVE-NEXT: and w8, w9, #0x2 +; NONEON-NOSVE-NEXT: sbfx w9, w7, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2009] +; NONEON-NOSVE-NEXT: and w8, w10, #0x80 +; NONEON-NOSVE-NEXT: sbfx w10, w6, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2007] +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w5, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2006] +; NONEON-NOSVE-NEXT: and w8, w10, #0x20 +; NONEON-NOSVE-NEXT: sbfx w10, w4, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2005] +; NONEON-NOSVE-NEXT: and w8, w9, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w3, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2004] +; NONEON-NOSVE-NEXT: and w8, w10, #0x8 +; NONEON-NOSVE-NEXT: sbfx w10, w2, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2003] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2002] +; NONEON-NOSVE-NEXT: and w8, w10, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2001] +; NONEON-NOSVE-NEXT: str q0, [sp, #2048] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #2000] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2050] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2048] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #2052] +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2054] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2056] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2058] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #2060] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w12, w13 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: add w9, w9, w10 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: str q0, [sp, #2032] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2034] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2032] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2036] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #2038] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #2040] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #2042] +; NONEON-NOSVE-NEXT: add w10, w12, w11 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #2044] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #2062] +; NONEON-NOSVE-NEXT: add w13, w13, w14 +; NONEON-NOSVE-NEXT: add w14, w15, w16 +; NONEON-NOSVE-NEXT: add w10, w10, w13 +; NONEON-NOSVE-NEXT: add w11, w14, w11 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #2046] +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w8, w12 +; NONEON-NOSVE-NEXT: add w8, w9, w13 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI3_0 +; NONEON-NOSVE-NEXT: bfi w8, w10, #16, #16 +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI3_0] +; NONEON-NOSVE-NEXT: add x9, sp, #1744 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB3_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldrb w10, [x0] +; NONEON-NOSVE-NEXT: strb wzr, [sp, #1999] +; NONEON-NOSVE-NEXT: sturh wzr, [x9, #253] +; NONEON-NOSVE-NEXT: stur wzr, [x9, #249] +; NONEON-NOSVE-NEXT: stur xzr, [x9, #241] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1984] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1984] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_3 +; NONEON-NOSVE-NEXT: b .LBB3_4 +; NONEON-NOSVE-NEXT: .LBB3_2: +; NONEON-NOSVE-NEXT: mov v0.16b, v1.16b +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_4 +; NONEON-NOSVE-NEXT: .LBB3_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #1] +; NONEON-NOSVE-NEXT: str q0, [sp, #1936] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1968] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1950] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1968] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1966] +; NONEON-NOSVE-NEXT: str q0, [sp, #1920] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #194] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: stur x11, [x9, #210] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1936] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1952] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1920] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1953] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1952] +; NONEON-NOSVE-NEXT: .LBB3_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_36 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_37 +; NONEON-NOSVE-NEXT: .LBB3_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_38 +; NONEON-NOSVE-NEXT: .LBB3_7: // %else11 +; NONEON-NOSVE-NEXT: add x9, sp, #1488 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_39 +; NONEON-NOSVE-NEXT: .LBB3_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_40 +; NONEON-NOSVE-NEXT: .LBB3_9: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_41 +; NONEON-NOSVE-NEXT: .LBB3_10: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_42 +; NONEON-NOSVE-NEXT: .LBB3_11: // %else23 +; NONEON-NOSVE-NEXT: add x9, sp, #1232 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_43 +; NONEON-NOSVE-NEXT: .LBB3_12: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_44 +; NONEON-NOSVE-NEXT: .LBB3_13: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_45 +; NONEON-NOSVE-NEXT: .LBB3_14: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_46 +; NONEON-NOSVE-NEXT: .LBB3_15: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_47 +; NONEON-NOSVE-NEXT: .LBB3_16: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_48 +; NONEON-NOSVE-NEXT: .LBB3_17: // %else41 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_49 +; NONEON-NOSVE-NEXT: .LBB3_18: // %else44 +; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_50 +; NONEON-NOSVE-NEXT: .LBB3_19: // %else47 +; NONEON-NOSVE-NEXT: add x9, sp, #720 +; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_51 +; NONEON-NOSVE-NEXT: .LBB3_20: // %else50 +; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_52 +; NONEON-NOSVE-NEXT: .LBB3_21: // %else53 +; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_53 +; NONEON-NOSVE-NEXT: .LBB3_22: // %else56 +; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_54 +; NONEON-NOSVE-NEXT: .LBB3_23: // %else59 +; NONEON-NOSVE-NEXT: add x9, sp, #464 +; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_55 +; NONEON-NOSVE-NEXT: .LBB3_24: // %else62 +; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_56 +; NONEON-NOSVE-NEXT: .LBB3_25: // %else65 +; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_57 +; NONEON-NOSVE-NEXT: .LBB3_26: // %else68 +; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_58 +; NONEON-NOSVE-NEXT: .LBB3_27: // %else71 +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_59 +; NONEON-NOSVE-NEXT: .LBB3_28: // %else74 +; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_60 +; NONEON-NOSVE-NEXT: .LBB3_29: // %else77 +; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_61 +; NONEON-NOSVE-NEXT: .LBB3_30: // %else80 +; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_62 +; NONEON-NOSVE-NEXT: .LBB3_31: // %else83 +; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_63 +; NONEON-NOSVE-NEXT: .LBB3_32: // %else86 +; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_64 +; NONEON-NOSVE-NEXT: .LBB3_33: // %else89 +; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_35 +; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.load91 +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #31] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB3_35: // %else92 +; NONEON-NOSVE-NEXT: add sp, sp, #2064 +; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.load4 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #2] +; NONEON-NOSVE-NEXT: str q0, [sp, #1872] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1904] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1887] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1904] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1903] +; NONEON-NOSVE-NEXT: str q0, [sp, #1856] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #131] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: stur x11, [x9, #147] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1872] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1888] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1856] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1890] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1888] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_6 +; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.load7 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #3] +; NONEON-NOSVE-NEXT: str q0, [sp, #1808] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1840] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #1820] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1840] +; NONEON-NOSVE-NEXT: str w10, [sp, #1836] +; NONEON-NOSVE-NEXT: str q0, [sp, #1792] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #68] +; NONEON-NOSVE-NEXT: stur x10, [x9, #84] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1810] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1826] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1808] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1824] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1792] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1827] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1824] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_7 +; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.load10 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #1744] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1776] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1759] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1776] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1775] +; NONEON-NOSVE-NEXT: str q0, [sp, #1728] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #5] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur x11, [x9, #21] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #1744] +; NONEON-NOSVE-NEXT: str w9, [sp, #1760] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1728] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1764] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1760] +; NONEON-NOSVE-NEXT: add x9, sp, #1488 +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_8 +; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.load13 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #5] +; NONEON-NOSVE-NEXT: str q0, [sp, #1680] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1712] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1694] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1712] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1710] +; NONEON-NOSVE-NEXT: str q0, [sp, #1664] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #198] +; NONEON-NOSVE-NEXT: stur x10, [x9, #214] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1684] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1700] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #1680] +; NONEON-NOSVE-NEXT: str w10, [sp, #1696] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1664] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1701] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1696] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_9 +; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.load16 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #6] +; NONEON-NOSVE-NEXT: str q0, [sp, #1616] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1648] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1631] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1648] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1647] +; NONEON-NOSVE-NEXT: str q0, [sp, #1600] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #135] +; NONEON-NOSVE-NEXT: stur x10, [x9, #151] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1620] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1636] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #1616] +; NONEON-NOSVE-NEXT: str w10, [sp, #1632] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1600] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1638] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1632] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_10 +; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.load19 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #7] +; NONEON-NOSVE-NEXT: str q0, [sp, #1552] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1584] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #1560] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1584] +; NONEON-NOSVE-NEXT: str x10, [sp, #1576] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1558] +; NONEON-NOSVE-NEXT: str q0, [sp, #1536] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1574] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1556] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1572] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #1552] +; NONEON-NOSVE-NEXT: str w10, [sp, #1568] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1536] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1575] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1568] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_11 +; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.load22 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #1488] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1520] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1503] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1520] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1519] +; NONEON-NOSVE-NEXT: str q0, [sp, #1472] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur w11, [x9, #9] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur w11, [x9, #25] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #1488] +; NONEON-NOSVE-NEXT: str x9, [sp, #1504] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1472] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1512] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1504] +; NONEON-NOSVE-NEXT: add x9, sp, #1232 +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_12 +; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.load25 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #9] +; NONEON-NOSVE-NEXT: str q0, [sp, #1424] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1456] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1438] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1456] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1454] +; NONEON-NOSVE-NEXT: str q0, [sp, #1408] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1432] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1448] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #1424] +; NONEON-NOSVE-NEXT: str x10, [sp, #1440] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1408] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1449] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1440] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_13 +; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.load28 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #10] +; NONEON-NOSVE-NEXT: str q0, [sp, #1360] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1392] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1375] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1392] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1391] +; NONEON-NOSVE-NEXT: str q0, [sp, #1344] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1368] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1384] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #1360] +; NONEON-NOSVE-NEXT: str x10, [sp, #1376] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1344] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1386] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1376] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_14 +; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.load31 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #11] +; NONEON-NOSVE-NEXT: str q0, [sp, #1296] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1328] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #1308] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1328] +; NONEON-NOSVE-NEXT: str w10, [sp, #1324] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1306] +; NONEON-NOSVE-NEXT: str q0, [sp, #1280] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1322] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1304] +; NONEON-NOSVE-NEXT: strh w10, [sp, #1320] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #1296] +; NONEON-NOSVE-NEXT: str x10, [sp, #1312] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1280] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1323] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1312] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_15 +; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.load34 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #1232] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1264] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1247] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1264] +; NONEON-NOSVE-NEXT: strb w10, [sp, #1263] +; NONEON-NOSVE-NEXT: str q0, [sp, #1216] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #1240] +; NONEON-NOSVE-NEXT: str w9, [sp, #1256] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #1232] +; NONEON-NOSVE-NEXT: str x9, [sp, #1248] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1216] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1260] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1248] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_16 +; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.load37 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #13] +; NONEON-NOSVE-NEXT: str q0, [sp, #1168] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1200] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #1182] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1200] +; NONEON-NOSVE-NEXT: strh w9, [sp, #1198] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1180] +; NONEON-NOSVE-NEXT: str q0, [sp, #1152] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1196] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #1176] +; NONEON-NOSVE-NEXT: str w9, [sp, #1192] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #1168] +; NONEON-NOSVE-NEXT: str x9, [sp, #1184] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1152] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1197] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1184] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_17 +; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.load40 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #14] +; NONEON-NOSVE-NEXT: str q0, [sp, #1104] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1136] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1119] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1136] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1135] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #1116] +; NONEON-NOSVE-NEXT: str q0, [sp, #1088] +; NONEON-NOSVE-NEXT: strh w9, [sp, #1132] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #1112] +; NONEON-NOSVE-NEXT: str w9, [sp, #1128] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #1104] +; NONEON-NOSVE-NEXT: str x9, [sp, #1120] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1088] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1134] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1120] +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_18 +; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.load43 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #15] +; NONEON-NOSVE-NEXT: str q0, [sp, #1024] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1072] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1038] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1072] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1070] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #1036] +; NONEON-NOSVE-NEXT: str q0, [sp, #1040] +; NONEON-NOSVE-NEXT: strh w9, [sp, #1068] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #1032] +; NONEON-NOSVE-NEXT: str w9, [sp, #1064] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #1024] +; NONEON-NOSVE-NEXT: str x9, [sp, #1056] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1040] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1071] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #1056] +; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_19 +; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.load46 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #976] +; NONEON-NOSVE-NEXT: add x10, sp, #976 +; NONEON-NOSVE-NEXT: strb w9, [sp, #1008] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #991] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #1008] +; NONEON-NOSVE-NEXT: strb w9, [sp, #1007] +; NONEON-NOSVE-NEXT: str q1, [sp, #960] +; NONEON-NOSVE-NEXT: ldurh w9, [x10, #13] +; NONEON-NOSVE-NEXT: ldur w11, [x10, #9] +; NONEON-NOSVE-NEXT: sturh w9, [x10, #29] +; NONEON-NOSVE-NEXT: ldur x9, [x10, #1] +; NONEON-NOSVE-NEXT: stur w11, [x10, #25] +; NONEON-NOSVE-NEXT: stur x9, [x10, #17] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #960] +; NONEON-NOSVE-NEXT: strb w9, [sp, #992] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #992] +; NONEON-NOSVE-NEXT: add x9, sp, #720 +; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_20 +; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.load49 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #17] +; NONEON-NOSVE-NEXT: str q1, [sp, #912] +; NONEON-NOSVE-NEXT: strb w10, [sp, #944] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #926] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #944] +; NONEON-NOSVE-NEXT: strh w10, [sp, #942] +; NONEON-NOSVE-NEXT: str q1, [sp, #896] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #194] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: stur x11, [x9, #210] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #912] +; NONEON-NOSVE-NEXT: strb w10, [sp, #928] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #896] +; NONEON-NOSVE-NEXT: strb w10, [sp, #929] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #928] +; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_21 +; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.load52 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #18] +; NONEON-NOSVE-NEXT: str q1, [sp, #848] +; NONEON-NOSVE-NEXT: strb w10, [sp, #880] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #863] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #880] +; NONEON-NOSVE-NEXT: strb w10, [sp, #879] +; NONEON-NOSVE-NEXT: str q1, [sp, #832] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #131] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: stur x11, [x9, #147] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #848] +; NONEON-NOSVE-NEXT: strh w10, [sp, #864] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #832] +; NONEON-NOSVE-NEXT: strb w10, [sp, #866] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #864] +; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_22 +; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.load55 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #19] +; NONEON-NOSVE-NEXT: str q1, [sp, #784] +; NONEON-NOSVE-NEXT: strb w10, [sp, #816] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #796] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #816] +; NONEON-NOSVE-NEXT: str w10, [sp, #812] +; NONEON-NOSVE-NEXT: str q1, [sp, #768] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #68] +; NONEON-NOSVE-NEXT: stur x10, [x9, #84] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #786] +; NONEON-NOSVE-NEXT: strb w10, [sp, #802] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #784] +; NONEON-NOSVE-NEXT: strh w10, [sp, #800] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #768] +; NONEON-NOSVE-NEXT: strb w10, [sp, #803] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #800] +; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_23 +; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.load58 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #20] +; NONEON-NOSVE-NEXT: str q1, [sp, #720] +; NONEON-NOSVE-NEXT: strb w10, [sp, #752] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #735] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #752] +; NONEON-NOSVE-NEXT: strb w10, [sp, #751] +; NONEON-NOSVE-NEXT: str q1, [sp, #704] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #5] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur x11, [x9, #21] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #720] +; NONEON-NOSVE-NEXT: str w9, [sp, #736] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #704] +; NONEON-NOSVE-NEXT: strb w9, [sp, #740] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #736] +; NONEON-NOSVE-NEXT: add x9, sp, #464 +; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_24 +; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.load61 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #21] +; NONEON-NOSVE-NEXT: str q1, [sp, #656] +; NONEON-NOSVE-NEXT: strb w10, [sp, #688] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #670] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #688] +; NONEON-NOSVE-NEXT: strh w10, [sp, #686] +; NONEON-NOSVE-NEXT: str q1, [sp, #640] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #198] +; NONEON-NOSVE-NEXT: stur x10, [x9, #214] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #660] +; NONEON-NOSVE-NEXT: strb w10, [sp, #676] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #656] +; NONEON-NOSVE-NEXT: str w10, [sp, #672] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #640] +; NONEON-NOSVE-NEXT: strb w10, [sp, #677] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #672] +; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_25 +; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.load64 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #22] +; NONEON-NOSVE-NEXT: str q1, [sp, #592] +; NONEON-NOSVE-NEXT: strb w10, [sp, #624] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #607] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #624] +; NONEON-NOSVE-NEXT: strb w10, [sp, #623] +; NONEON-NOSVE-NEXT: str q1, [sp, #576] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #135] +; NONEON-NOSVE-NEXT: stur x10, [x9, #151] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #596] +; NONEON-NOSVE-NEXT: strh w10, [sp, #612] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #592] +; NONEON-NOSVE-NEXT: str w10, [sp, #608] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #576] +; NONEON-NOSVE-NEXT: strb w10, [sp, #614] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #608] +; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_26 +; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.load67 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #23] +; NONEON-NOSVE-NEXT: str q1, [sp, #528] +; NONEON-NOSVE-NEXT: strb w10, [sp, #560] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #536] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #560] +; NONEON-NOSVE-NEXT: str x10, [sp, #552] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #534] +; NONEON-NOSVE-NEXT: str q1, [sp, #512] +; NONEON-NOSVE-NEXT: strb w10, [sp, #550] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #532] +; NONEON-NOSVE-NEXT: strh w10, [sp, #548] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #528] +; NONEON-NOSVE-NEXT: str w10, [sp, #544] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #512] +; NONEON-NOSVE-NEXT: strb w10, [sp, #551] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #544] +; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_27 +; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.load70 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #24] +; NONEON-NOSVE-NEXT: str q1, [sp, #464] +; NONEON-NOSVE-NEXT: strb w10, [sp, #496] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #479] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #496] +; NONEON-NOSVE-NEXT: strb w10, [sp, #495] +; NONEON-NOSVE-NEXT: str q1, [sp, #448] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: ldur w11, [x9, #9] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: stur w11, [x9, #25] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #464] +; NONEON-NOSVE-NEXT: str x9, [sp, #480] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #448] +; NONEON-NOSVE-NEXT: strb w9, [sp, #488] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #480] +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_28 +; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.load73 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #25] +; NONEON-NOSVE-NEXT: str q1, [sp, #400] +; NONEON-NOSVE-NEXT: strb w10, [sp, #432] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #414] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #432] +; NONEON-NOSVE-NEXT: strh w10, [sp, #430] +; NONEON-NOSVE-NEXT: str q1, [sp, #384] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #202] +; NONEON-NOSVE-NEXT: stur w10, [x9, #218] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #408] +; NONEON-NOSVE-NEXT: strb w10, [sp, #424] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #400] +; NONEON-NOSVE-NEXT: str x10, [sp, #416] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #384] +; NONEON-NOSVE-NEXT: strb w10, [sp, #425] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #416] +; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_29 +; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.load76 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #26] +; NONEON-NOSVE-NEXT: str q1, [sp, #336] +; NONEON-NOSVE-NEXT: strb w10, [sp, #368] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #351] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #368] +; NONEON-NOSVE-NEXT: strb w10, [sp, #367] +; NONEON-NOSVE-NEXT: str q1, [sp, #320] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #139] +; NONEON-NOSVE-NEXT: stur w10, [x9, #155] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #344] +; NONEON-NOSVE-NEXT: strh w10, [sp, #360] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #336] +; NONEON-NOSVE-NEXT: str x10, [sp, #352] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #320] +; NONEON-NOSVE-NEXT: strb w10, [sp, #362] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #352] +; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_30 +; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.load79 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #27] +; NONEON-NOSVE-NEXT: str q1, [sp, #272] +; NONEON-NOSVE-NEXT: strb w10, [sp, #304] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #284] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #304] +; NONEON-NOSVE-NEXT: str w10, [sp, #300] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #282] +; NONEON-NOSVE-NEXT: str q1, [sp, #256] +; NONEON-NOSVE-NEXT: strb w10, [sp, #298] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #280] +; NONEON-NOSVE-NEXT: strh w10, [sp, #296] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #272] +; NONEON-NOSVE-NEXT: str x10, [sp, #288] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #256] +; NONEON-NOSVE-NEXT: strb w10, [sp, #299] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #288] +; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_31 +; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.load82 +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #28] +; NONEON-NOSVE-NEXT: str q1, [sp, #208] +; NONEON-NOSVE-NEXT: strb w10, [sp, #240] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #223] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #240] +; NONEON-NOSVE-NEXT: strb w10, [sp, #239] +; NONEON-NOSVE-NEXT: str q1, [sp, #192] +; NONEON-NOSVE-NEXT: ldurh w10, [x9, #13] +; NONEON-NOSVE-NEXT: sturh w10, [x9, #29] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #216] +; NONEON-NOSVE-NEXT: str w9, [sp, #232] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #208] +; NONEON-NOSVE-NEXT: str x9, [sp, #224] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #192] +; NONEON-NOSVE-NEXT: strb w9, [sp, #236] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #224] +; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_32 +; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.load85 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #29] +; NONEON-NOSVE-NEXT: str q1, [sp, #144] +; NONEON-NOSVE-NEXT: strb w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #158] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #176] +; NONEON-NOSVE-NEXT: strh w9, [sp, #174] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #156] +; NONEON-NOSVE-NEXT: str q1, [sp, #128] +; NONEON-NOSVE-NEXT: strb w9, [sp, #172] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #152] +; NONEON-NOSVE-NEXT: str w9, [sp, #168] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #144] +; NONEON-NOSVE-NEXT: str x9, [sp, #160] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #128] +; NONEON-NOSVE-NEXT: strb w9, [sp, #173] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #160] +; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_33 +; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.load88 +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #30] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: strb w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #95] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #112] +; NONEON-NOSVE-NEXT: strb w9, [sp, #111] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #92] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #108] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #88] +; NONEON-NOSVE-NEXT: str w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #80] +; NONEON-NOSVE-NEXT: str x9, [sp, #96] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #64] +; NONEON-NOSVE-NEXT: strb w9, [sp, #110] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #96] +; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_34 +; NONEON-NOSVE-NEXT: b .LBB3_35 %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %src, i32 8, <32 x i1> %mask, <32 x i8> zeroinitializer) ret <32 x i8> %load } @@ -638,27 +1642,36 @@ define <2 x half> @masked_load_v2f16(ptr %src, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI4_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] -; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB4_3 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB4_4 -; NONEON-NOSVE-NEXT: .LBB4_2: // %else2 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB4_3: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB4_2 -; NONEON-NOSVE-NEXT: .LBB4_4: // %cond.load1 -; NONEON-NOSVE-NEXT: add x8, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: str d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: bfxil w8, w9, #0, #1 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB4_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: ldr h1, [x0] +; NONEON-NOSVE-NEXT: str h1, [sp, #24] +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB4_3 +; NONEON-NOSVE-NEXT: b .LBB4_4 +; NONEON-NOSVE-NEXT: .LBB4_2: +; NONEON-NOSVE-NEXT: adrp x9, .LCPI4_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI4_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB4_4 +; NONEON-NOSVE-NEXT: .LBB4_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] +; NONEON-NOSVE-NEXT: str h1, [sp, #18] +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: .LBB4_4: // %else2 +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %load = call <2 x half> @llvm.masked.load.v2f16(ptr %src, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer) ret <2 x half> %load @@ -678,39 +1691,84 @@ define <4 x half> @masked_load_v4f16(ptr %src, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI5_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h1, v0.4h -; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB5_5 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB5_6 -; NONEON-NOSVE-NEXT: .LBB5_2: // %else2 +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: str d0, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #114] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #116] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #118] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #112] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB5_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: ldr h1, [x0] +; NONEON-NOSVE-NEXT: stur wzr, [sp, #106] +; NONEON-NOSVE-NEXT: str h1, [sp, #104] +; NONEON-NOSVE-NEXT: str h0, [sp, #110] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #104] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB5_3 +; NONEON-NOSVE-NEXT: b .LBB5_4 +; NONEON-NOSVE-NEXT: .LBB5_2: +; NONEON-NOSVE-NEXT: adrp x9, .LCPI5_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI5_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB5_4 +; NONEON-NOSVE-NEXT: .LBB5_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #84] +; NONEON-NOSVE-NEXT: str h1, [sp, #96] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #96] +; NONEON-NOSVE-NEXT: str w9, [sp, #92] +; NONEON-NOSVE-NEXT: str d0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #80] +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #72] +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #88] +; NONEON-NOSVE-NEXT: .LBB5_4: // %else2 ; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB5_7 -; NONEON-NOSVE-NEXT: .LBB5_3: // %else5 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB5_8 -; NONEON-NOSVE-NEXT: .LBB5_4: // %else8 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: .LBB5_6: // %else8 +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB5_5: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB5_2 -; NONEON-NOSVE-NEXT: .LBB5_6: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB5_3 ; NONEON-NOSVE-NEXT: .LBB5_7: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB5_4 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] +; NONEON-NOSVE-NEXT: str h1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] +; NONEON-NOSVE-NEXT: str w9, [sp, #56] +; NONEON-NOSVE-NEXT: str d0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: str h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] +; NONEON-NOSVE-NEXT: str h0, [sp, #60] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB5_6 ; NONEON-NOSVE-NEXT: .LBB5_8: // %cond.load7 -; NONEON-NOSVE-NEXT: add x8, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #6] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str h1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #24] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %load = call <4 x half> @llvm.masked.load.v4f16(ptr %src, i32 8, <4 x i1> %mask, <4 x half> zeroinitializer) ret <4 x half> %load @@ -731,62 +1789,184 @@ define <8 x half> @masked_load_v8f16(ptr %src, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv b1, v0.8b -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB6_9 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB6_10 -; NONEON-NOSVE-NEXT: .LBB6_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB6_11 -; NONEON-NOSVE-NEXT: .LBB6_3: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB6_12 -; NONEON-NOSVE-NEXT: .LBB6_4: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB6_13 -; NONEON-NOSVE-NEXT: .LBB6_5: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB6_14 -; NONEON-NOSVE-NEXT: .LBB6_6: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB6_15 -; NONEON-NOSVE-NEXT: .LBB6_7: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB6_16 -; NONEON-NOSVE-NEXT: .LBB6_8: // %else20 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB6_9: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB6_2 -; NONEON-NOSVE-NEXT: .LBB6_10: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB6_3 -; NONEON-NOSVE-NEXT: .LBB6_11: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB6_4 -; NONEON-NOSVE-NEXT: .LBB6_12: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB6_5 -; NONEON-NOSVE-NEXT: .LBB6_13: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB6_6 -; NONEON-NOSVE-NEXT: .LBB6_14: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #10 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB6_7 -; NONEON-NOSVE-NEXT: .LBB6_15: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB6_8 -; NONEON-NOSVE-NEXT: .LBB6_16: // %cond.load19 -; NONEON-NOSVE-NEXT: add x8, x0, #14 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x8] +; NONEON-NOSVE-NEXT: sub sp, sp, #496 +; NONEON-NOSVE-NEXT: str x29, [sp, #480] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 496 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str d0, [sp, #464] +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #466] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #467] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #465] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #468] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #469] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #470] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #464] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #471] +; NONEON-NOSVE-NEXT: and w11, w11, #0x2 +; NONEON-NOSVE-NEXT: and w13, w13, #0x10 +; NONEON-NOSVE-NEXT: bfxil w11, w12, #0, #1 +; NONEON-NOSVE-NEXT: and w12, w14, #0x20 +; NONEON-NOSVE-NEXT: orr w8, w8, w13 +; NONEON-NOSVE-NEXT: and w13, w15, #0x40 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w11, w8 +; NONEON-NOSVE-NEXT: orr w11, w12, w13 +; NONEON-NOSVE-NEXT: orr w8, w8, w11 +; NONEON-NOSVE-NEXT: and w10, w10, #0x80 +; NONEON-NOSVE-NEXT: add w10, w8, w10 +; NONEON-NOSVE-NEXT: and w8, w10, #0xff +; NONEON-NOSVE-NEXT: tbz w10, #0, .LBB6_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: ldr h1, [x0] +; NONEON-NOSVE-NEXT: stur wzr, [x9, #250] +; NONEON-NOSVE-NEXT: stur xzr, [x9, #242] +; NONEON-NOSVE-NEXT: str h1, [sp, #448] +; NONEON-NOSVE-NEXT: str h0, [sp, #462] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #448] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB6_3 +; NONEON-NOSVE-NEXT: b .LBB6_4 +; NONEON-NOSVE-NEXT: .LBB6_2: +; NONEON-NOSVE-NEXT: adrp x10, .LCPI6_0 +; NONEON-NOSVE-NEXT: ldr q0, [x10, :lo12:.LCPI6_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB6_4 +; NONEON-NOSVE-NEXT: .LBB6_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #2] +; NONEON-NOSVE-NEXT: str q0, [sp, #400] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #412] +; NONEON-NOSVE-NEXT: str h1, [sp, #432] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #432] +; NONEON-NOSVE-NEXT: str w10, [sp, #428] +; NONEON-NOSVE-NEXT: str q0, [sp, #384] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #196] +; NONEON-NOSVE-NEXT: stur x10, [x9, #212] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #400] +; NONEON-NOSVE-NEXT: str h0, [sp, #416] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #384] +; NONEON-NOSVE-NEXT: str h0, [sp, #418] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #416] +; NONEON-NOSVE-NEXT: .LBB6_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB6_12 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB6_13 +; NONEON-NOSVE-NEXT: .LBB6_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB6_14 +; NONEON-NOSVE-NEXT: .LBB6_7: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB6_15 +; NONEON-NOSVE-NEXT: .LBB6_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB6_16 +; NONEON-NOSVE-NEXT: .LBB6_9: // %else17 +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB6_11 +; NONEON-NOSVE-NEXT: .LBB6_10: // %cond.load19 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #14] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str h1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB6_11: // %else20 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #480] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #496 ; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB6_12: // %cond.load4 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #336] +; NONEON-NOSVE-NEXT: str h1, [sp, #368] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #368] +; NONEON-NOSVE-NEXT: str q0, [sp, #320] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #350] +; NONEON-NOSVE-NEXT: str h0, [sp, #366] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #134] +; NONEON-NOSVE-NEXT: stur x10, [x9, #150] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #336] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #320] +; NONEON-NOSVE-NEXT: str w10, [sp, #352] +; NONEON-NOSVE-NEXT: str h0, [sp, #356] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #352] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB6_6 +; NONEON-NOSVE-NEXT: .LBB6_13: // %cond.load7 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #6] +; NONEON-NOSVE-NEXT: str q0, [sp, #272] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #280] +; NONEON-NOSVE-NEXT: str h1, [sp, #304] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #304] +; NONEON-NOSVE-NEXT: str x10, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #272] +; NONEON-NOSVE-NEXT: str q0, [sp, #256] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #276] +; NONEON-NOSVE-NEXT: str w10, [sp, #288] +; NONEON-NOSVE-NEXT: str h0, [sp, #292] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #256] +; NONEON-NOSVE-NEXT: str h0, [sp, #294] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #288] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB6_7 +; NONEON-NOSVE-NEXT: .LBB6_14: // %cond.load10 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #208] +; NONEON-NOSVE-NEXT: str h1, [sp, #240] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #240] +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #222] +; NONEON-NOSVE-NEXT: str h0, [sp, #238] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #10] +; NONEON-NOSVE-NEXT: stur w10, [x9, #26] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #208] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #192] +; NONEON-NOSVE-NEXT: str x9, [sp, #224] +; NONEON-NOSVE-NEXT: str h0, [sp, #232] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #224] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB6_8 +; NONEON-NOSVE-NEXT: .LBB6_15: // %cond.load13 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #10] +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #156] +; NONEON-NOSVE-NEXT: str h1, [sp, #176] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #176] +; NONEON-NOSVE-NEXT: str w9, [sp, #172] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #144] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #152] +; NONEON-NOSVE-NEXT: str x9, [sp, #160] +; NONEON-NOSVE-NEXT: str h0, [sp, #168] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #128] +; NONEON-NOSVE-NEXT: str h0, [sp, #170] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #160] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB6_9 +; NONEON-NOSVE-NEXT: .LBB6_16: // %cond.load16 +; NONEON-NOSVE-NEXT: ldr h1, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #88] +; NONEON-NOSVE-NEXT: str h1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #112] +; NONEON-NOSVE-NEXT: str w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #80] +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #94] +; NONEON-NOSVE-NEXT: str x9, [sp, #96] +; NONEON-NOSVE-NEXT: str h0, [sp, #110] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #64] +; NONEON-NOSVE-NEXT: str h0, [sp, #108] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #96] +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB6_10 +; NONEON-NOSVE-NEXT: b .LBB6_11 %load = call <8 x half> @llvm.masked.load.v8f16(ptr %src, i32 8, <8 x i1> %mask, <8 x half> zeroinitializer) ret <8 x half> %load } @@ -814,113 +1994,383 @@ define <16 x half> @masked_load_v16f16(ptr %src, <16 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 -; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #1024 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 1040 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str q0, [sp, #976] +; NONEON-NOSVE-NEXT: adrp x9, .LCPI7_0 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #984] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1000] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #976] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #992] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #991] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1007] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #990] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1006] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #989] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1005] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #988] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1004] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #987] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1003] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #986] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1002] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #985] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1001] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #983] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #999] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #982] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #998] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #981] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #997] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #980] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #996] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #979] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #995] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #978] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #994] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #977] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #993] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #992] ; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: addv h2, v0.8h -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s2 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB7_17 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_18 -; NONEON-NOSVE-NEXT: .LBB7_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB7_19 -; NONEON-NOSVE-NEXT: .LBB7_3: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB7_20 -; NONEON-NOSVE-NEXT: .LBB7_4: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB7_21 -; NONEON-NOSVE-NEXT: .LBB7_5: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB7_22 -; NONEON-NOSVE-NEXT: .LBB7_6: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB7_23 -; NONEON-NOSVE-NEXT: .LBB7_7: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB7_24 -; NONEON-NOSVE-NEXT: .LBB7_8: // %else20 -; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB7_25 -; NONEON-NOSVE-NEXT: .LBB7_9: // %else23 -; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB7_26 -; NONEON-NOSVE-NEXT: .LBB7_10: // %else26 -; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB7_27 -; NONEON-NOSVE-NEXT: .LBB7_11: // %else29 -; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB7_28 -; NONEON-NOSVE-NEXT: .LBB7_12: // %else32 -; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB7_29 -; NONEON-NOSVE-NEXT: .LBB7_13: // %else35 -; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB7_30 -; NONEON-NOSVE-NEXT: .LBB7_14: // %else38 -; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB7_31 -; NONEON-NOSVE-NEXT: .LBB7_15: // %else41 -; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_32 -; NONEON-NOSVE-NEXT: .LBB7_16: // %else44 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB7_17: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB7_2 -; NONEON-NOSVE-NEXT: .LBB7_18: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB7_3 -; NONEON-NOSVE-NEXT: .LBB7_19: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB7_4 -; NONEON-NOSVE-NEXT: .LBB7_20: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #6 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB7_5 -; NONEON-NOSVE-NEXT: .LBB7_21: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB7_6 -; NONEON-NOSVE-NEXT: .LBB7_22: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #10 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB7_7 -; NONEON-NOSVE-NEXT: .LBB7_23: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB7_8 -; NONEON-NOSVE-NEXT: .LBB7_24: // %cond.load19 -; NONEON-NOSVE-NEXT: add x9, x0, #14 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[7], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB7_9 -; NONEON-NOSVE-NEXT: .LBB7_25: // %cond.load22 -; NONEON-NOSVE-NEXT: add x9, x0, #16 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[0], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB7_10 -; NONEON-NOSVE-NEXT: .LBB7_26: // %cond.load25 -; NONEON-NOSVE-NEXT: add x9, x0, #18 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB7_11 -; NONEON-NOSVE-NEXT: .LBB7_27: // %cond.load28 -; NONEON-NOSVE-NEXT: add x9, x0, #20 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB7_12 -; NONEON-NOSVE-NEXT: .LBB7_28: // %cond.load31 -; NONEON-NOSVE-NEXT: add x9, x0, #22 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB7_13 -; NONEON-NOSVE-NEXT: .LBB7_29: // %cond.load34 -; NONEON-NOSVE-NEXT: add x9, x0, #24 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[4], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB7_14 -; NONEON-NOSVE-NEXT: .LBB7_30: // %cond.load37 -; NONEON-NOSVE-NEXT: add x9, x0, #26 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[5], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB7_15 -; NONEON-NOSVE-NEXT: .LBB7_31: // %cond.load40 -; NONEON-NOSVE-NEXT: add x9, x0, #28 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[6], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB7_16 -; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.load43 -; NONEON-NOSVE-NEXT: add x8, x0, #30 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[7], [x8] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI7_0] +; NONEON-NOSVE-NEXT: add x9, sp, #720 +; NONEON-NOSVE-NEXT: str q0, [sp, #1008] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #1010] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #1008] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1012] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #1014] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #1016] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #1018] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #1020] +; NONEON-NOSVE-NEXT: add w8, w10, w8 +; NONEON-NOSVE-NEXT: add w10, w11, w12 +; NONEON-NOSVE-NEXT: add w11, w13, w14 +; NONEON-NOSVE-NEXT: add w8, w8, w10 +; NONEON-NOSVE-NEXT: add w10, w11, w15 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #1022] +; NONEON-NOSVE-NEXT: add w8, w8, w10 +; NONEON-NOSVE-NEXT: add w8, w8, w11 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB7_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: fmov s0, wzr +; NONEON-NOSVE-NEXT: ldr h2, [x0] +; NONEON-NOSVE-NEXT: stur wzr, [x9, #250] +; NONEON-NOSVE-NEXT: stur xzr, [x9, #242] +; NONEON-NOSVE-NEXT: str h2, [sp, #960] +; NONEON-NOSVE-NEXT: str h0, [sp, #974] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #960] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_3 +; NONEON-NOSVE-NEXT: b .LBB7_4 +; NONEON-NOSVE-NEXT: .LBB7_2: +; NONEON-NOSVE-NEXT: mov v0.16b, v1.16b +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB7_4 +; NONEON-NOSVE-NEXT: .LBB7_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #2] +; NONEON-NOSVE-NEXT: str q0, [sp, #912] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #924] +; NONEON-NOSVE-NEXT: str h2, [sp, #944] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #944] +; NONEON-NOSVE-NEXT: str w10, [sp, #940] +; NONEON-NOSVE-NEXT: str q0, [sp, #896] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #196] +; NONEON-NOSVE-NEXT: stur x10, [x9, #212] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #912] +; NONEON-NOSVE-NEXT: str h0, [sp, #928] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #896] +; NONEON-NOSVE-NEXT: str h0, [sp, #930] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #928] +; NONEON-NOSVE-NEXT: .LBB7_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB7_20 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB7_21 +; NONEON-NOSVE-NEXT: .LBB7_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB7_22 +; NONEON-NOSVE-NEXT: .LBB7_7: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB7_23 +; NONEON-NOSVE-NEXT: .LBB7_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB7_24 +; NONEON-NOSVE-NEXT: .LBB7_9: // %else17 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB7_25 +; NONEON-NOSVE-NEXT: .LBB7_10: // %else20 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB7_26 +; NONEON-NOSVE-NEXT: .LBB7_11: // %else23 +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB7_27 +; NONEON-NOSVE-NEXT: .LBB7_12: // %else26 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB7_28 +; NONEON-NOSVE-NEXT: .LBB7_13: // %else29 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB7_29 +; NONEON-NOSVE-NEXT: .LBB7_14: // %else32 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB7_30 +; NONEON-NOSVE-NEXT: .LBB7_15: // %else35 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB7_31 +; NONEON-NOSVE-NEXT: .LBB7_16: // %else38 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB7_32 +; NONEON-NOSVE-NEXT: .LBB7_17: // %else41 +; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB7_19 +; NONEON-NOSVE-NEXT: .LBB7_18: // %cond.load43 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #30] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str h2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str h1, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: str h1, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB7_19: // %else44 +; NONEON-NOSVE-NEXT: add sp, sp, #1024 +; NONEON-NOSVE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB7_20: // %cond.load4 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #848] +; NONEON-NOSVE-NEXT: str h2, [sp, #880] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #880] +; NONEON-NOSVE-NEXT: str q0, [sp, #832] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #862] +; NONEON-NOSVE-NEXT: str h0, [sp, #878] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #134] +; NONEON-NOSVE-NEXT: stur x10, [x9, #150] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #848] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #832] +; NONEON-NOSVE-NEXT: str w10, [sp, #864] +; NONEON-NOSVE-NEXT: str h0, [sp, #868] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #864] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB7_6 +; NONEON-NOSVE-NEXT: .LBB7_21: // %cond.load7 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #6] +; NONEON-NOSVE-NEXT: str q0, [sp, #784] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #792] +; NONEON-NOSVE-NEXT: str h2, [sp, #816] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #816] +; NONEON-NOSVE-NEXT: str x10, [sp, #808] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #784] +; NONEON-NOSVE-NEXT: str q0, [sp, #768] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #788] +; NONEON-NOSVE-NEXT: str w10, [sp, #800] +; NONEON-NOSVE-NEXT: str h0, [sp, #804] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #768] +; NONEON-NOSVE-NEXT: str h0, [sp, #806] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #800] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB7_7 +; NONEON-NOSVE-NEXT: .LBB7_22: // %cond.load10 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #720] +; NONEON-NOSVE-NEXT: str h2, [sp, #752] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #752] +; NONEON-NOSVE-NEXT: str q0, [sp, #704] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #734] +; NONEON-NOSVE-NEXT: str h0, [sp, #750] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #10] +; NONEON-NOSVE-NEXT: stur w10, [x9, #26] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #720] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #704] +; NONEON-NOSVE-NEXT: str x9, [sp, #736] +; NONEON-NOSVE-NEXT: str h0, [sp, #744] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #736] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB7_8 +; NONEON-NOSVE-NEXT: .LBB7_23: // %cond.load13 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #10] +; NONEON-NOSVE-NEXT: str q0, [sp, #656] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #668] +; NONEON-NOSVE-NEXT: str h2, [sp, #688] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #688] +; NONEON-NOSVE-NEXT: str w9, [sp, #684] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #656] +; NONEON-NOSVE-NEXT: str q0, [sp, #640] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #664] +; NONEON-NOSVE-NEXT: str x9, [sp, #672] +; NONEON-NOSVE-NEXT: str h0, [sp, #680] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #640] +; NONEON-NOSVE-NEXT: str h0, [sp, #682] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #672] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB7_9 +; NONEON-NOSVE-NEXT: .LBB7_24: // %cond.load16 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #592] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #600] +; NONEON-NOSVE-NEXT: str h2, [sp, #624] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #624] +; NONEON-NOSVE-NEXT: str w9, [sp, #616] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #592] +; NONEON-NOSVE-NEXT: str q0, [sp, #576] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #606] +; NONEON-NOSVE-NEXT: str x9, [sp, #608] +; NONEON-NOSVE-NEXT: str h0, [sp, #622] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #576] +; NONEON-NOSVE-NEXT: str h0, [sp, #620] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #608] +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB7_10 +; NONEON-NOSVE-NEXT: .LBB7_25: // %cond.load19 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #14] +; NONEON-NOSVE-NEXT: str q0, [sp, #512] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #520] +; NONEON-NOSVE-NEXT: str h2, [sp, #560] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #560] +; NONEON-NOSVE-NEXT: str w9, [sp, #552] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #512] +; NONEON-NOSVE-NEXT: str q0, [sp, #528] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #524] +; NONEON-NOSVE-NEXT: str x9, [sp, #544] +; NONEON-NOSVE-NEXT: str h0, [sp, #556] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #528] +; NONEON-NOSVE-NEXT: str h0, [sp, #558] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #544] +; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB7_11 +; NONEON-NOSVE-NEXT: .LBB7_26: // %cond.load22 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #464] +; NONEON-NOSVE-NEXT: add x9, sp, #464 +; NONEON-NOSVE-NEXT: str h2, [sp, #496] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #496] +; NONEON-NOSVE-NEXT: str q1, [sp, #448] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #478] +; NONEON-NOSVE-NEXT: str h1, [sp, #494] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #10] +; NONEON-NOSVE-NEXT: ldur x11, [x9, #2] +; NONEON-NOSVE-NEXT: stur w10, [x9, #26] +; NONEON-NOSVE-NEXT: stur x11, [x9, #18] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #448] +; NONEON-NOSVE-NEXT: str h1, [sp, #480] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #480] +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB7_12 +; NONEON-NOSVE-NEXT: .LBB7_27: // %cond.load25 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #18] +; NONEON-NOSVE-NEXT: str q1, [sp, #400] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #412] +; NONEON-NOSVE-NEXT: str h2, [sp, #432] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #432] +; NONEON-NOSVE-NEXT: str w10, [sp, #428] +; NONEON-NOSVE-NEXT: str q1, [sp, #384] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #196] +; NONEON-NOSVE-NEXT: stur x10, [x9, #212] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #400] +; NONEON-NOSVE-NEXT: str h1, [sp, #416] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #384] +; NONEON-NOSVE-NEXT: str h1, [sp, #418] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #416] +; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB7_13 +; NONEON-NOSVE-NEXT: .LBB7_28: // %cond.load28 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #20] +; NONEON-NOSVE-NEXT: str q1, [sp, #336] +; NONEON-NOSVE-NEXT: str h2, [sp, #368] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #368] +; NONEON-NOSVE-NEXT: str q1, [sp, #320] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #350] +; NONEON-NOSVE-NEXT: str h1, [sp, #366] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #134] +; NONEON-NOSVE-NEXT: stur x10, [x9, #150] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #336] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #320] +; NONEON-NOSVE-NEXT: str w10, [sp, #352] +; NONEON-NOSVE-NEXT: str h1, [sp, #356] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #352] +; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB7_14 +; NONEON-NOSVE-NEXT: .LBB7_29: // %cond.load31 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #22] +; NONEON-NOSVE-NEXT: str q1, [sp, #272] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #280] +; NONEON-NOSVE-NEXT: str h2, [sp, #304] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #304] +; NONEON-NOSVE-NEXT: str x10, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #272] +; NONEON-NOSVE-NEXT: str q1, [sp, #256] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #276] +; NONEON-NOSVE-NEXT: str w10, [sp, #288] +; NONEON-NOSVE-NEXT: str h1, [sp, #292] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #256] +; NONEON-NOSVE-NEXT: str h1, [sp, #294] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #288] +; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB7_15 +; NONEON-NOSVE-NEXT: .LBB7_30: // %cond.load34 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #24] +; NONEON-NOSVE-NEXT: str q1, [sp, #208] +; NONEON-NOSVE-NEXT: str h2, [sp, #240] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #240] +; NONEON-NOSVE-NEXT: str q1, [sp, #192] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #222] +; NONEON-NOSVE-NEXT: str h1, [sp, #238] +; NONEON-NOSVE-NEXT: ldur w10, [x9, #10] +; NONEON-NOSVE-NEXT: stur w10, [x9, #26] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #208] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #192] +; NONEON-NOSVE-NEXT: str x9, [sp, #224] +; NONEON-NOSVE-NEXT: str h1, [sp, #232] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #224] +; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB7_16 +; NONEON-NOSVE-NEXT: .LBB7_31: // %cond.load37 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #26] +; NONEON-NOSVE-NEXT: str q1, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #156] +; NONEON-NOSVE-NEXT: str h2, [sp, #176] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #176] +; NONEON-NOSVE-NEXT: str w9, [sp, #172] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #144] +; NONEON-NOSVE-NEXT: str q1, [sp, #128] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #152] +; NONEON-NOSVE-NEXT: str x9, [sp, #160] +; NONEON-NOSVE-NEXT: str h1, [sp, #168] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #128] +; NONEON-NOSVE-NEXT: str h1, [sp, #170] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #160] +; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB7_17 +; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.load40 +; NONEON-NOSVE-NEXT: ldr h2, [x0, #28] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #88] +; NONEON-NOSVE-NEXT: str h2, [sp, #112] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #112] +; NONEON-NOSVE-NEXT: str w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr x9, [sp, #80] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #94] +; NONEON-NOSVE-NEXT: str x9, [sp, #96] +; NONEON-NOSVE-NEXT: str h1, [sp, #110] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #64] +; NONEON-NOSVE-NEXT: str h1, [sp, #108] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #96] +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_18 +; NONEON-NOSVE-NEXT: b .LBB7_19 %load = call <16 x half> @llvm.masked.load.v16f16(ptr %src, i32 8, <16 x i1> %mask, <16 x half> zeroinitializer) ret <16 x half> %load } @@ -939,27 +2389,38 @@ define <2 x float> @masked_load_v2f32(ptr %src, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] -; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB8_3 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB8_4 -; NONEON-NOSVE-NEXT: .LBB8_2: // %else2 -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB8_3: // %cond.load +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #48] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: bfxil w8, w9, #0, #1 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB8_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load ; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB8_2 -; NONEON-NOSVE-NEXT: .LBB8_4: // %cond.load1 -; NONEON-NOSVE-NEXT: add x8, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x8] -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NONEON-NOSVE-NEXT: str wzr, [sp, #44] +; NONEON-NOSVE-NEXT: str s0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB8_3 +; NONEON-NOSVE-NEXT: b .LBB8_4 +; NONEON-NOSVE-NEXT: .LBB8_2: +; NONEON-NOSVE-NEXT: adrp x9, .LCPI8_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI8_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB8_4 +; NONEON-NOSVE-NEXT: .LBB8_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr s1, [x0, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: str s1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: .LBB8_4: // %else2 +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %load = call <2 x float> @llvm.masked.load.v2f32(ptr %src, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer) ret <2 x float> %load @@ -980,37 +2441,80 @@ define <4 x float> @masked_load_v4f32(ptr %src, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI9_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h1, v0.4h -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB9_5 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB9_6 -; NONEON-NOSVE-NEXT: .LBB9_2: // %else2 +; NONEON-NOSVE-NEXT: sub sp, sp, #224 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 +; NONEON-NOSVE-NEXT: str d0, [sp, #208] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #210] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #212] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #214] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #208] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB9_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldr s0, [x0] +; NONEON-NOSVE-NEXT: str wzr, [sp, #204] +; NONEON-NOSVE-NEXT: stur xzr, [sp, #196] +; NONEON-NOSVE-NEXT: str s0, [sp, #192] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #192] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB9_3 +; NONEON-NOSVE-NEXT: b .LBB9_4 +; NONEON-NOSVE-NEXT: .LBB9_2: +; NONEON-NOSVE-NEXT: adrp x9, .LCPI9_0 +; NONEON-NOSVE-NEXT: ldr q0, [x9, :lo12:.LCPI9_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB9_4 +; NONEON-NOSVE-NEXT: .LBB9_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr s1, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: str s1, [sp, #176] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #176] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #152] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #168] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #144] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #128] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #160] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #160] +; NONEON-NOSVE-NEXT: .LBB9_4: // %else2 ; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB9_7 -; NONEON-NOSVE-NEXT: .LBB9_3: // %else5 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB9_8 -; NONEON-NOSVE-NEXT: .LBB9_4: // %else8 +; NONEON-NOSVE-NEXT: .LBB9_6: // %else8 +; NONEON-NOSVE-NEXT: add sp, sp, #224 ; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB9_5: // %cond.load -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB9_2 -; NONEON-NOSVE-NEXT: .LBB9_6: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB9_3 ; NONEON-NOSVE-NEXT: .LBB9_7: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB9_4 +; NONEON-NOSVE-NEXT: ldr s1, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #80] +; NONEON-NOSVE-NEXT: str s1, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #92] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #112] +; NONEON-NOSVE-NEXT: str q0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #80] +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #96] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #64] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #96] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB9_6 ; NONEON-NOSVE-NEXT: .LBB9_8: // %cond.load7 -; NONEON-NOSVE-NEXT: add x8, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[3], [x8] +; NONEON-NOSVE-NEXT: ldr s1, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #224 ; NONEON-NOSVE-NEXT: ret %load = call <4 x float> @llvm.masked.load.v4f32(ptr %src, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer) ret <4 x float> %load @@ -1064,63 +2568,170 @@ define <8 x float> @masked_load_v8f32(ptr %src, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 +; NONEON-NOSVE-NEXT: sub sp, sp, #496 +; NONEON-NOSVE-NEXT: str x29, [sp, #480] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 496 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: str d0, [sp, #464] ; NONEON-NOSVE-NEXT: adrp x8, .LCPI10_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: addv b2, v0.8b -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s2 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB10_9 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB10_10 -; NONEON-NOSVE-NEXT: .LBB10_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB10_11 -; NONEON-NOSVE-NEXT: .LBB10_3: // %else5 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB10_12 -; NONEON-NOSVE-NEXT: .LBB10_4: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB10_13 -; NONEON-NOSVE-NEXT: .LBB10_5: // %else11 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB10_14 -; NONEON-NOSVE-NEXT: .LBB10_6: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB10_15 -; NONEON-NOSVE-NEXT: .LBB10_7: // %else17 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB10_16 -; NONEON-NOSVE-NEXT: .LBB10_8: // %else20 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB10_9: // %cond.load +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #466] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #467] +; NONEON-NOSVE-NEXT: ldrb w11, [sp, #465] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #468] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #469] +; NONEON-NOSVE-NEXT: ldrb w15, [sp, #470] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #464] +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: sbfx w15, w15, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #471] +; NONEON-NOSVE-NEXT: and w11, w11, #0x2 +; NONEON-NOSVE-NEXT: and w13, w13, #0x10 +; NONEON-NOSVE-NEXT: bfxil w11, w12, #0, #1 +; NONEON-NOSVE-NEXT: and w12, w14, #0x20 +; NONEON-NOSVE-NEXT: orr w9, w9, w13 +; NONEON-NOSVE-NEXT: and w13, w15, #0x40 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w11, w9 +; NONEON-NOSVE-NEXT: orr w11, w12, w13 +; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI10_0] +; NONEON-NOSVE-NEXT: orr w9, w9, w11 +; NONEON-NOSVE-NEXT: and w10, w10, #0x80 +; NONEON-NOSVE-NEXT: add w10, w9, w10 +; NONEON-NOSVE-NEXT: add x9, sp, #208 +; NONEON-NOSVE-NEXT: and w8, w10, #0xff +; NONEON-NOSVE-NEXT: tbz w10, #0, .LBB10_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load ; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB10_2 -; NONEON-NOSVE-NEXT: .LBB10_10: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB10_3 -; NONEON-NOSVE-NEXT: .LBB10_11: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB10_4 -; NONEON-NOSVE-NEXT: .LBB10_12: // %cond.load7 -; NONEON-NOSVE-NEXT: add x9, x0, #12 -; NONEON-NOSVE-NEXT: ld1 { v0.s }[3], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB10_5 -; NONEON-NOSVE-NEXT: .LBB10_13: // %cond.load10 -; NONEON-NOSVE-NEXT: add x9, x0, #16 -; NONEON-NOSVE-NEXT: ld1 { v1.s }[0], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB10_6 -; NONEON-NOSVE-NEXT: .LBB10_14: // %cond.load13 -; NONEON-NOSVE-NEXT: add x9, x0, #20 -; NONEON-NOSVE-NEXT: ld1 { v1.s }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB10_7 -; NONEON-NOSVE-NEXT: .LBB10_15: // %cond.load16 -; NONEON-NOSVE-NEXT: add x9, x0, #24 -; NONEON-NOSVE-NEXT: ld1 { v1.s }[2], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB10_8 -; NONEON-NOSVE-NEXT: .LBB10_16: // %cond.load19 -; NONEON-NOSVE-NEXT: add x8, x0, #28 -; NONEON-NOSVE-NEXT: ld1 { v1.s }[3], [x8] +; NONEON-NOSVE-NEXT: str wzr, [sp, #460] +; NONEON-NOSVE-NEXT: stur xzr, [x9, #244] +; NONEON-NOSVE-NEXT: str s0, [sp, #448] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #448] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB10_3 +; NONEON-NOSVE-NEXT: b .LBB10_4 +; NONEON-NOSVE-NEXT: .LBB10_2: +; NONEON-NOSVE-NEXT: mov v0.16b, v1.16b +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB10_4 +; NONEON-NOSVE-NEXT: .LBB10_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #4] +; NONEON-NOSVE-NEXT: str q0, [sp, #400] +; NONEON-NOSVE-NEXT: str s2, [sp, #432] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #432] +; NONEON-NOSVE-NEXT: str q0, [sp, #384] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #412] +; NONEON-NOSVE-NEXT: str s0, [sp, #428] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #408] +; NONEON-NOSVE-NEXT: str s0, [sp, #424] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #400] +; NONEON-NOSVE-NEXT: str s0, [sp, #416] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #384] +; NONEON-NOSVE-NEXT: str s0, [sp, #420] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #416] +; NONEON-NOSVE-NEXT: .LBB10_4: // %else2 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB10_12 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB10_13 +; NONEON-NOSVE-NEXT: .LBB10_6: // %else8 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB10_14 +; NONEON-NOSVE-NEXT: .LBB10_7: // %else11 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB10_15 +; NONEON-NOSVE-NEXT: .LBB10_8: // %else14 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB10_16 +; NONEON-NOSVE-NEXT: .LBB10_9: // %else17 +; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB10_11 +; NONEON-NOSVE-NEXT: .LBB10_10: // %cond.load19 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #28] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str s2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: str x8, [sp, #32] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #16] +; NONEON-NOSVE-NEXT: stp s2, s1, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB10_11: // %else20 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #480] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #496 ; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: .LBB10_12: // %cond.load4 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #336] +; NONEON-NOSVE-NEXT: str s2, [sp, #368] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #368] +; NONEON-NOSVE-NEXT: str q0, [sp, #320] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #348] +; NONEON-NOSVE-NEXT: str s0, [sp, #364] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #340] +; NONEON-NOSVE-NEXT: str s0, [sp, #356] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #336] +; NONEON-NOSVE-NEXT: str s0, [sp, #352] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #320] +; NONEON-NOSVE-NEXT: str s0, [sp, #360] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #352] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB10_6 +; NONEON-NOSVE-NEXT: .LBB10_13: // %cond.load7 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #12] +; NONEON-NOSVE-NEXT: str q0, [sp, #256] +; NONEON-NOSVE-NEXT: ldr x10, [sp, #256] +; NONEON-NOSVE-NEXT: str s2, [sp, #304] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #304] +; NONEON-NOSVE-NEXT: str x10, [sp, #288] +; NONEON-NOSVE-NEXT: str q0, [sp, #272] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #264] +; NONEON-NOSVE-NEXT: str s0, [sp, #296] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #272] +; NONEON-NOSVE-NEXT: str s0, [sp, #300] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #288] +; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB10_7 +; NONEON-NOSVE-NEXT: .LBB10_14: // %cond.load10 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #208] +; NONEON-NOSVE-NEXT: str s2, [sp, #240] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #240] +; NONEON-NOSVE-NEXT: str q1, [sp, #192] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #220] +; NONEON-NOSVE-NEXT: str s1, [sp, #236] +; NONEON-NOSVE-NEXT: ldur x10, [x9, #4] +; NONEON-NOSVE-NEXT: stur x10, [x9, #20] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #192] +; NONEON-NOSVE-NEXT: str s1, [sp, #224] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #224] +; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB10_8 +; NONEON-NOSVE-NEXT: .LBB10_15: // %cond.load13 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #20] +; NONEON-NOSVE-NEXT: str q1, [sp, #144] +; NONEON-NOSVE-NEXT: str s2, [sp, #176] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #176] +; NONEON-NOSVE-NEXT: str q1, [sp, #128] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #152] +; NONEON-NOSVE-NEXT: stp s1, s2, [sp, #168] +; NONEON-NOSVE-NEXT: ldr s2, [sp, #144] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #128] +; NONEON-NOSVE-NEXT: stp s2, s1, [sp, #160] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #160] +; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB10_9 +; NONEON-NOSVE-NEXT: .LBB10_16: // %cond.load16 +; NONEON-NOSVE-NEXT: ldr s2, [x0, #24] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: str s2, [sp, #112] +; NONEON-NOSVE-NEXT: ldr s2, [sp, #92] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #112] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldp s1, s3, [sp, #80] +; NONEON-NOSVE-NEXT: stp s1, s3, [sp, #96] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #64] +; NONEON-NOSVE-NEXT: stp s1, s2, [sp, #104] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #96] +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB10_10 +; NONEON-NOSVE-NEXT: b .LBB10_11 %load = call <8 x float> @llvm.masked.load.v8f32(ptr %src, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer) ret <8 x float> %load } @@ -1140,25 +2751,38 @@ define <2 x double> @masked_load_v2f64(ptr %src, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI11_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI11_0] -; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addp v1.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB11_3 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB11_4 -; NONEON-NOSVE-NEXT: .LBB11_2: // %else2 -; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB11_3: // %cond.load +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: str d0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #80] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: bfxil w8, w9, #0, #1 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB11_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB11_2 -; NONEON-NOSVE-NEXT: .LBB11_4: // %cond.load1 -; NONEON-NOSVE-NEXT: add x8, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.d }[1], [x8] +; NONEON-NOSVE-NEXT: str xzr, [sp, #72] +; NONEON-NOSVE-NEXT: str d0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB11_3 +; NONEON-NOSVE-NEXT: b .LBB11_4 +; NONEON-NOSVE-NEXT: .LBB11_2: +; NONEON-NOSVE-NEXT: adrp x9, .LCPI11_0 +; NONEON-NOSVE-NEXT: ldr q0, [x9, :lo12:.LCPI11_0] +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB11_4 +; NONEON-NOSVE-NEXT: .LBB11_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr d1, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp] +; NONEON-NOSVE-NEXT: str d1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: .LBB11_4: // %else2 +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %load = call <2 x double> @llvm.masked.load.v2f64(ptr %src, i32 8, <2 x i1> %mask, <2 x double> zeroinitializer) ret <2 x double> %load @@ -1188,38 +2812,74 @@ define <4 x double> @masked_load_v4f64(ptr %src, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_load_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI12_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI12_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: movi v1.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: addv h2, v0.4h -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 -; NONEON-NOSVE-NEXT: fmov w8, s2 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB12_5 -; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB12_6 -; NONEON-NOSVE-NEXT: .LBB12_2: // %else2 +; NONEON-NOSVE-NEXT: sub sp, sp, #224 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 224 +; NONEON-NOSVE-NEXT: str d0, [sp, #208] +; NONEON-NOSVE-NEXT: adrp x9, .LCPI12_0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #210] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #212] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #214] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #208] +; NONEON-NOSVE-NEXT: ldr q1, [x9, :lo12:.LCPI12_0] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w10, w10, #0x4 +; NONEON-NOSVE-NEXT: and w11, w11, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w12, #0, #1 +; NONEON-NOSVE-NEXT: orr w10, w10, w11 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB12_2 +; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load +; NONEON-NOSVE-NEXT: ldr d0, [x0] +; NONEON-NOSVE-NEXT: str xzr, [sp, #200] +; NONEON-NOSVE-NEXT: str d0, [sp, #192] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #192] +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB12_3 +; NONEON-NOSVE-NEXT: b .LBB12_4 +; NONEON-NOSVE-NEXT: .LBB12_2: +; NONEON-NOSVE-NEXT: mov v0.16b, v1.16b +; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB12_4 +; NONEON-NOSVE-NEXT: .LBB12_3: // %cond.load1 +; NONEON-NOSVE-NEXT: ldr d2, [x0, #8] +; NONEON-NOSVE-NEXT: str q0, [sp, #128] +; NONEON-NOSVE-NEXT: str d2, [sp, #176] +; NONEON-NOSVE-NEXT: ldr d2, [sp, #128] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #176] +; NONEON-NOSVE-NEXT: str q0, [sp, #144] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #144] +; NONEON-NOSVE-NEXT: stp d2, d0, [sp, #160] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #160] +; NONEON-NOSVE-NEXT: .LBB12_4: // %else2 ; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB12_7 -; NONEON-NOSVE-NEXT: .LBB12_3: // %else5 +; NONEON-NOSVE-NEXT: // %bb.5: // %else5 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB12_8 -; NONEON-NOSVE-NEXT: .LBB12_4: // %else8 +; NONEON-NOSVE-NEXT: .LBB12_6: // %else8 +; NONEON-NOSVE-NEXT: add sp, sp, #224 ; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB12_5: // %cond.load -; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB12_2 -; NONEON-NOSVE-NEXT: .LBB12_6: // %cond.load1 -; NONEON-NOSVE-NEXT: add x9, x0, #8 -; NONEON-NOSVE-NEXT: ld1 { v0.d }[1], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB12_3 ; NONEON-NOSVE-NEXT: .LBB12_7: // %cond.load4 -; NONEON-NOSVE-NEXT: add x9, x0, #16 -; NONEON-NOSVE-NEXT: ld1 { v1.d }[0], [x9] -; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB12_4 +; NONEON-NOSVE-NEXT: ldr d2, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: str d2, [sp, #112] +; NONEON-NOSVE-NEXT: ldr d2, [sp, #88] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #112] +; NONEON-NOSVE-NEXT: str q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #64] +; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #96] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #96] +; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB12_6 ; NONEON-NOSVE-NEXT: .LBB12_8: // %cond.load7 -; NONEON-NOSVE-NEXT: add x8, x0, #24 -; NONEON-NOSVE-NEXT: ld1 { v1.d }[1], [x8] +; NONEON-NOSVE-NEXT: ldr d2, [x0, #24] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: str d2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d2, [sp] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d2, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #224 ; NONEON-NOSVE-NEXT: ret %load = call <4 x double> @llvm.masked.load.v4f64(ptr %src, i32 8, <4 x i1> %mask, <4 x double> zeroinitializer) ret <4 x double> %load @@ -1249,34 +2909,51 @@ define <3 x i32> @masked_load_zext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; ; NONEON-NOSVE-LABEL: masked_load_zext_v3i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #16 -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 ; NONEON-NOSVE-NEXT: and w8, w1, #0x1 ; NONEON-NOSVE-NEXT: bfi w8, w2, #1, #1 ; NONEON-NOSVE-NEXT: bfi w8, w3, #2, #1 ; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB13_2 ; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: ldrh w9, [x0] +; NONEON-NOSVE-NEXT: stur wzr, [sp, #66] +; NONEON-NOSVE-NEXT: strh w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB13_3 ; NONEON-NOSVE-NEXT: b .LBB13_4 ; NONEON-NOSVE-NEXT: .LBB13_2: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI13_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI13_0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB13_4 ; NONEON-NOSVE-NEXT: .LBB13_3: // %cond.load1 -; NONEON-NOSVE-NEXT: mov v1.16b, v0.16b -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] -; NONEON-NOSVE-NEXT: mov v1.h[2], v0.h[2] -; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: ldrh w9, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] ; NONEON-NOSVE-NEXT: .LBB13_4: // %else2 ; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB13_6 ; NONEON-NOSVE-NEXT: // %bb.5: // %cond.load4 -; NONEON-NOSVE-NEXT: mov v0.h[1], v0.h[1] -; NONEON-NOSVE-NEXT: add x8, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: ldrh w8, [x0, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: .LBB13_6: // %else5 -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) %extend = zext <3 x i16> %load_value to <3 x i32> @@ -1307,34 +2984,51 @@ define <3 x i32> @masked_load_sext_v3i32(ptr %load_ptr, <3 x i1> %pm) { ; ; NONEON-NOSVE-LABEL: masked_load_sext_v3i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #16 -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 ; NONEON-NOSVE-NEXT: and w8, w1, #0x1 ; NONEON-NOSVE-NEXT: bfi w8, w2, #1, #1 ; NONEON-NOSVE-NEXT: bfi w8, w3, #2, #1 ; NONEON-NOSVE-NEXT: tbz w8, #0, .LBB14_2 ; NONEON-NOSVE-NEXT: // %bb.1: // %cond.load -; NONEON-NOSVE-NEXT: ldr h0, [x0] +; NONEON-NOSVE-NEXT: ldrh w9, [x0] +; NONEON-NOSVE-NEXT: stur wzr, [sp, #66] +; NONEON-NOSVE-NEXT: strh w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #64] ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB14_3 ; NONEON-NOSVE-NEXT: b .LBB14_4 ; NONEON-NOSVE-NEXT: .LBB14_2: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x9, .LCPI14_0 +; NONEON-NOSVE-NEXT: ldr d0, [x9, :lo12:.LCPI14_0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB14_4 ; NONEON-NOSVE-NEXT: .LBB14_3: // %cond.load1 -; NONEON-NOSVE-NEXT: mov v1.16b, v0.16b -; NONEON-NOSVE-NEXT: add x9, x0, #2 -; NONEON-NOSVE-NEXT: ld1 { v1.h }[1], [x9] -; NONEON-NOSVE-NEXT: mov v1.h[2], v0.h[2] -; NONEON-NOSVE-NEXT: fmov d0, d1 +; NONEON-NOSVE-NEXT: ldrh w9, [x0, #2] +; NONEON-NOSVE-NEXT: str d0, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w9, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] ; NONEON-NOSVE-NEXT: .LBB14_4: // %else2 ; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB14_6 ; NONEON-NOSVE-NEXT: // %bb.5: // %cond.load4 -; NONEON-NOSVE-NEXT: mov v0.h[1], v0.h[1] -; NONEON-NOSVE-NEXT: add x8, x0, #4 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] +; NONEON-NOSVE-NEXT: ldrh w8, [x0, #4] +; NONEON-NOSVE-NEXT: str d0, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] ; NONEON-NOSVE-NEXT: .LBB14_6: // %else5 -; NONEON-NOSVE-NEXT: sshll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: add sp, sp, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %load_value = tail call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %load_ptr, i32 4, <3 x i1> %pm, <3 x i16> zeroinitializer) %extend = sext <3 x i16> %load_value to <3 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index 0904399558aee1..a79ce9db9abfde 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -23,13 +23,21 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI0_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB0_5 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB0_6 @@ -38,6 +46,7 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB0_3: // %else4 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB0_8 ; NONEON-NOSVE-NEXT: .LBB0_4: // %else6 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB0_5: // %cond.store ; NONEON-NOSVE-NEXT: strb wzr, [x0] @@ -50,6 +59,7 @@ define void @masked_store_v4i8(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB0_4 ; NONEON-NOSVE-NEXT: .LBB0_8: // %cond.store5 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4i8(<4 x i8> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void @@ -69,14 +79,39 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI1_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w8, s0 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB1_9 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #6] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: and w9, w9, #0x8 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: and w10, w10, #0x2 +; NONEON-NOSVE-NEXT: and w12, w12, #0x10 +; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w11, w13, #0x20 +; NONEON-NOSVE-NEXT: orr w8, w8, w12 +; NONEON-NOSVE-NEXT: and w12, w14, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: orr w10, w11, w12 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: and w9, w9, #0x80 +; NONEON-NOSVE-NEXT: add w9, w8, w9 +; NONEON-NOSVE-NEXT: and w8, w9, #0xff +; NONEON-NOSVE-NEXT: tbnz w9, #0, .LBB1_9 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB1_10 ; NONEON-NOSVE-NEXT: .LBB1_2: // %else2 @@ -92,6 +127,7 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB1_7: // %else12 ; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB1_16 ; NONEON-NOSVE-NEXT: .LBB1_8: // %else14 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB1_9: // %cond.store ; NONEON-NOSVE-NEXT: strb wzr, [x0] @@ -116,6 +152,7 @@ define void @masked_store_v8i8(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB1_8 ; NONEON-NOSVE-NEXT: .LBB1_16: // %cond.store13 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #7] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8i8(<8 x i8> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void @@ -135,15 +172,89 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 -; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI2_0] -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #44] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w12, w13 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w14 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w10 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB2_17 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB2_18 @@ -176,6 +287,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB2_15: // %else28 ; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB2_32 ; NONEON-NOSVE-NEXT: .LBB2_16: // %else30 +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB2_17: // %cond.store ; NONEON-NOSVE-NEXT: strb wzr, [x0] @@ -224,6 +336,7 @@ define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB2_16 ; NONEON-NOSVE-NEXT: .LBB2_32: // %cond.store29 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #15] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void @@ -308,241 +421,328 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] -; NONEON-NOSVE-NEXT: fmov s1, w1 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #80] -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] -; NONEON-NOSVE-NEXT: mov v1.b[1], w2 -; NONEON-NOSVE-NEXT: mov v0.b[1], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp] -; NONEON-NOSVE-NEXT: mov v1.b[2], w3 -; NONEON-NOSVE-NEXT: mov v0.b[2], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] -; NONEON-NOSVE-NEXT: mov v1.b[3], w4 -; NONEON-NOSVE-NEXT: mov v0.b[3], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] -; NONEON-NOSVE-NEXT: mov v1.b[4], w5 -; NONEON-NOSVE-NEXT: mov v0.b[4], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] -; NONEON-NOSVE-NEXT: mov v1.b[5], w6 -; NONEON-NOSVE-NEXT: mov v0.b[5], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] -; NONEON-NOSVE-NEXT: mov v1.b[6], w7 -; NONEON-NOSVE-NEXT: mov v0.b[6], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] -; NONEON-NOSVE-NEXT: mov v1.b[7], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] -; NONEON-NOSVE-NEXT: mov v0.b[7], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] -; NONEON-NOSVE-NEXT: mov v1.b[8], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] -; NONEON-NOSVE-NEXT: mov v0.b[8], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] -; NONEON-NOSVE-NEXT: mov v1.b[9], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #24] -; NONEON-NOSVE-NEXT: mov v0.b[9], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] -; NONEON-NOSVE-NEXT: mov v1.b[10], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] -; NONEON-NOSVE-NEXT: mov v0.b[10], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] -; NONEON-NOSVE-NEXT: mov v1.b[11], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] -; NONEON-NOSVE-NEXT: mov v0.b[11], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #168] -; NONEON-NOSVE-NEXT: mov v1.b[12], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #48] -; NONEON-NOSVE-NEXT: mov v0.b[12], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] -; NONEON-NOSVE-NEXT: mov v1.b[13], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #56] -; NONEON-NOSVE-NEXT: mov v0.b[13], w8 +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: str x29, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: .cfi_offset w29, -16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #216] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #152] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #272] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #176] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #160] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: and w8, w9, #0x1 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #264] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #256] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #248] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: and w8, w9, #0x20 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #240] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: and w8, w9, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #232] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: and w8, w9, #0x8 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #224] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #208] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: and w8, w9, #0x2 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #200] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #192] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] ; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] -; NONEON-NOSVE-NEXT: mov v1.b[14], w9 -; NONEON-NOSVE-NEXT: ldr w9, [sp, #64] -; NONEON-NOSVE-NEXT: mov v0.b[14], w8 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] -; NONEON-NOSVE-NEXT: mov v1.b[15], w9 -; NONEON-NOSVE-NEXT: mov v0.b[15], w8 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 -; NONEON-NOSVE-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] -; NONEON-NOSVE-NEXT: shl v1.16b, v1.16b, #7 -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: cmlt v1.16b, v1.16b, #0 -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v1.16b, v1.16b, v2.16b -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: ext v3.16b, v1.16b, v1.16b, #8 -; NONEON-NOSVE-NEXT: ext v2.16b, v0.16b, v0.16b, #8 -; NONEON-NOSVE-NEXT: zip1 v1.16b, v1.16b, v3.16b -; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: addv h1, v1.8h -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w8, s1 -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: bfi w8, w9, #16, #16 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_33 +; NONEON-NOSVE-NEXT: and w9, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: strb w9, [sp, #22] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #168] +; NONEON-NOSVE-NEXT: and w10, w10, #0x20 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w10, [sp, #21] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: and w8, w11, #0x8 +; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #88] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: and w8, w10, #0x2 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #136] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: and w8, w9, #0x1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #144] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w1, #0x1 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #104] +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #80] +; NONEON-NOSVE-NEXT: sbfx w11, w11, #0, #1 +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: and w8, w9, #0x80 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #128] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #120] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] +; NONEON-NOSVE-NEXT: and w9, w9, #0x20 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: strb w9, [sp, #13] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #96] +; NONEON-NOSVE-NEXT: and w10, w10, #0x10 +; NONEON-NOSVE-NEXT: zip1 v2.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: strb w10, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: and w8, w11, #0x4 +; NONEON-NOSVE-NEXT: sbfx w10, w12, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: and w8, w9, #0x2 +; NONEON-NOSVE-NEXT: sbfx w9, w7, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: and w8, w10, #0x80 +; NONEON-NOSVE-NEXT: sbfx w10, w6, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #7] +; NONEON-NOSVE-NEXT: and w8, w9, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w5, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #6] +; NONEON-NOSVE-NEXT: and w8, w10, #0x20 +; NONEON-NOSVE-NEXT: sbfx w10, w4, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #5] +; NONEON-NOSVE-NEXT: and w8, w9, #0x10 +; NONEON-NOSVE-NEXT: sbfx w9, w3, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #4] +; NONEON-NOSVE-NEXT: and w8, w10, #0x8 +; NONEON-NOSVE-NEXT: sbfx w10, w2, #0, #1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #3] +; NONEON-NOSVE-NEXT: and w8, w9, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #2] +; NONEON-NOSVE-NEXT: and w8, w10, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldr q0, [sp] +; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #40] +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #34] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w12, w13 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #42] +; NONEON-NOSVE-NEXT: add w9, w9, w10 +; NONEON-NOSVE-NEXT: add w10, w12, w11 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #44] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w12, w13, w14 +; NONEON-NOSVE-NEXT: add w14, w15, w16 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #62] +; NONEON-NOSVE-NEXT: add w10, w10, w12 +; NONEON-NOSVE-NEXT: add w11, w14, w11 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #46] +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w8, w13 +; NONEON-NOSVE-NEXT: add w8, w9, w12 +; NONEON-NOSVE-NEXT: bfi w8, w10, #16, #16 +; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB3_34 ; NONEON-NOSVE-NEXT: // %bb.1: // %else -; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_34 +; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB3_35 ; NONEON-NOSVE-NEXT: .LBB3_2: // %else2 -; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_35 +; NONEON-NOSVE-NEXT: tbnz w8, #2, .LBB3_36 ; NONEON-NOSVE-NEXT: .LBB3_3: // %else4 -; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_36 +; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB3_37 ; NONEON-NOSVE-NEXT: .LBB3_4: // %else6 -; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_37 +; NONEON-NOSVE-NEXT: tbnz w8, #4, .LBB3_38 ; NONEON-NOSVE-NEXT: .LBB3_5: // %else8 -; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_38 +; NONEON-NOSVE-NEXT: tbnz w8, #5, .LBB3_39 ; NONEON-NOSVE-NEXT: .LBB3_6: // %else10 -; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_39 +; NONEON-NOSVE-NEXT: tbnz w8, #6, .LBB3_40 ; NONEON-NOSVE-NEXT: .LBB3_7: // %else12 -; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_40 +; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB3_41 ; NONEON-NOSVE-NEXT: .LBB3_8: // %else14 -; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_41 +; NONEON-NOSVE-NEXT: tbnz w8, #8, .LBB3_42 ; NONEON-NOSVE-NEXT: .LBB3_9: // %else16 -; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_42 +; NONEON-NOSVE-NEXT: tbnz w8, #9, .LBB3_43 ; NONEON-NOSVE-NEXT: .LBB3_10: // %else18 -; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_43 +; NONEON-NOSVE-NEXT: tbnz w8, #10, .LBB3_44 ; NONEON-NOSVE-NEXT: .LBB3_11: // %else20 -; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_44 +; NONEON-NOSVE-NEXT: tbnz w8, #11, .LBB3_45 ; NONEON-NOSVE-NEXT: .LBB3_12: // %else22 -; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_45 +; NONEON-NOSVE-NEXT: tbnz w8, #12, .LBB3_46 ; NONEON-NOSVE-NEXT: .LBB3_13: // %else24 -; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_46 +; NONEON-NOSVE-NEXT: tbnz w8, #13, .LBB3_47 ; NONEON-NOSVE-NEXT: .LBB3_14: // %else26 -; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_47 +; NONEON-NOSVE-NEXT: tbnz w8, #14, .LBB3_48 ; NONEON-NOSVE-NEXT: .LBB3_15: // %else28 -; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_48 +; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB3_49 ; NONEON-NOSVE-NEXT: .LBB3_16: // %else30 -; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_49 +; NONEON-NOSVE-NEXT: tbnz w8, #16, .LBB3_50 ; NONEON-NOSVE-NEXT: .LBB3_17: // %else32 -; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_50 +; NONEON-NOSVE-NEXT: tbnz w8, #17, .LBB3_51 ; NONEON-NOSVE-NEXT: .LBB3_18: // %else34 -; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_51 +; NONEON-NOSVE-NEXT: tbnz w8, #18, .LBB3_52 ; NONEON-NOSVE-NEXT: .LBB3_19: // %else36 -; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_52 +; NONEON-NOSVE-NEXT: tbnz w8, #19, .LBB3_53 ; NONEON-NOSVE-NEXT: .LBB3_20: // %else38 -; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_53 +; NONEON-NOSVE-NEXT: tbnz w8, #20, .LBB3_54 ; NONEON-NOSVE-NEXT: .LBB3_21: // %else40 -; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_54 +; NONEON-NOSVE-NEXT: tbnz w8, #21, .LBB3_55 ; NONEON-NOSVE-NEXT: .LBB3_22: // %else42 -; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_55 +; NONEON-NOSVE-NEXT: tbnz w8, #22, .LBB3_56 ; NONEON-NOSVE-NEXT: .LBB3_23: // %else44 -; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_56 +; NONEON-NOSVE-NEXT: tbnz w8, #23, .LBB3_57 ; NONEON-NOSVE-NEXT: .LBB3_24: // %else46 -; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_57 +; NONEON-NOSVE-NEXT: tbnz w8, #24, .LBB3_58 ; NONEON-NOSVE-NEXT: .LBB3_25: // %else48 -; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_58 +; NONEON-NOSVE-NEXT: tbnz w8, #25, .LBB3_59 ; NONEON-NOSVE-NEXT: .LBB3_26: // %else50 -; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_59 +; NONEON-NOSVE-NEXT: tbnz w8, #26, .LBB3_60 ; NONEON-NOSVE-NEXT: .LBB3_27: // %else52 -; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_60 +; NONEON-NOSVE-NEXT: tbnz w8, #27, .LBB3_61 ; NONEON-NOSVE-NEXT: .LBB3_28: // %else54 -; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_61 +; NONEON-NOSVE-NEXT: tbnz w8, #28, .LBB3_62 ; NONEON-NOSVE-NEXT: .LBB3_29: // %else56 -; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_62 +; NONEON-NOSVE-NEXT: tbnz w8, #29, .LBB3_63 ; NONEON-NOSVE-NEXT: .LBB3_30: // %else58 -; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_63 +; NONEON-NOSVE-NEXT: tbnz w8, #30, .LBB3_64 ; NONEON-NOSVE-NEXT: .LBB3_31: // %else60 -; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_64 -; NONEON-NOSVE-NEXT: .LBB3_32: // %else62 +; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_33 +; NONEON-NOSVE-NEXT: .LBB3_32: // %cond.store61 +; NONEON-NOSVE-NEXT: strb wzr, [x0, #31] +; NONEON-NOSVE-NEXT: .LBB3_33: // %else62 +; NONEON-NOSVE-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret -; NONEON-NOSVE-NEXT: .LBB3_33: // %cond.store +; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.store ; NONEON-NOSVE-NEXT: strb wzr, [x0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB3_2 -; NONEON-NOSVE-NEXT: .LBB3_34: // %cond.store1 +; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.store1 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #1] ; NONEON-NOSVE-NEXT: tbz w8, #2, .LBB3_3 -; NONEON-NOSVE-NEXT: .LBB3_35: // %cond.store3 +; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.store3 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #2] ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB3_4 -; NONEON-NOSVE-NEXT: .LBB3_36: // %cond.store5 +; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.store5 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #3] ; NONEON-NOSVE-NEXT: tbz w8, #4, .LBB3_5 -; NONEON-NOSVE-NEXT: .LBB3_37: // %cond.store7 +; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.store7 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #4] ; NONEON-NOSVE-NEXT: tbz w8, #5, .LBB3_6 -; NONEON-NOSVE-NEXT: .LBB3_38: // %cond.store9 +; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.store9 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #5] ; NONEON-NOSVE-NEXT: tbz w8, #6, .LBB3_7 -; NONEON-NOSVE-NEXT: .LBB3_39: // %cond.store11 +; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.store11 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #6] ; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB3_8 -; NONEON-NOSVE-NEXT: .LBB3_40: // %cond.store13 +; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.store13 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #7] ; NONEON-NOSVE-NEXT: tbz w8, #8, .LBB3_9 -; NONEON-NOSVE-NEXT: .LBB3_41: // %cond.store15 +; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.store15 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #8] ; NONEON-NOSVE-NEXT: tbz w8, #9, .LBB3_10 -; NONEON-NOSVE-NEXT: .LBB3_42: // %cond.store17 +; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.store17 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #9] ; NONEON-NOSVE-NEXT: tbz w8, #10, .LBB3_11 -; NONEON-NOSVE-NEXT: .LBB3_43: // %cond.store19 +; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.store19 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #10] ; NONEON-NOSVE-NEXT: tbz w8, #11, .LBB3_12 -; NONEON-NOSVE-NEXT: .LBB3_44: // %cond.store21 +; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.store21 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #11] ; NONEON-NOSVE-NEXT: tbz w8, #12, .LBB3_13 -; NONEON-NOSVE-NEXT: .LBB3_45: // %cond.store23 +; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.store23 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #12] ; NONEON-NOSVE-NEXT: tbz w8, #13, .LBB3_14 -; NONEON-NOSVE-NEXT: .LBB3_46: // %cond.store25 +; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.store25 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #13] ; NONEON-NOSVE-NEXT: tbz w8, #14, .LBB3_15 -; NONEON-NOSVE-NEXT: .LBB3_47: // %cond.store27 +; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.store27 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #14] ; NONEON-NOSVE-NEXT: tbz w8, #15, .LBB3_16 -; NONEON-NOSVE-NEXT: .LBB3_48: // %cond.store29 +; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.store29 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #15] ; NONEON-NOSVE-NEXT: tbz w8, #16, .LBB3_17 -; NONEON-NOSVE-NEXT: .LBB3_49: // %cond.store31 +; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.store31 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #16] ; NONEON-NOSVE-NEXT: tbz w8, #17, .LBB3_18 -; NONEON-NOSVE-NEXT: .LBB3_50: // %cond.store33 +; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.store33 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #17] ; NONEON-NOSVE-NEXT: tbz w8, #18, .LBB3_19 -; NONEON-NOSVE-NEXT: .LBB3_51: // %cond.store35 +; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.store35 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #18] ; NONEON-NOSVE-NEXT: tbz w8, #19, .LBB3_20 -; NONEON-NOSVE-NEXT: .LBB3_52: // %cond.store37 +; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.store37 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #19] ; NONEON-NOSVE-NEXT: tbz w8, #20, .LBB3_21 -; NONEON-NOSVE-NEXT: .LBB3_53: // %cond.store39 +; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.store39 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #20] ; NONEON-NOSVE-NEXT: tbz w8, #21, .LBB3_22 -; NONEON-NOSVE-NEXT: .LBB3_54: // %cond.store41 +; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.store41 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #21] ; NONEON-NOSVE-NEXT: tbz w8, #22, .LBB3_23 -; NONEON-NOSVE-NEXT: .LBB3_55: // %cond.store43 +; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.store43 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #22] ; NONEON-NOSVE-NEXT: tbz w8, #23, .LBB3_24 -; NONEON-NOSVE-NEXT: .LBB3_56: // %cond.store45 +; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.store45 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #23] ; NONEON-NOSVE-NEXT: tbz w8, #24, .LBB3_25 -; NONEON-NOSVE-NEXT: .LBB3_57: // %cond.store47 +; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.store47 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #24] ; NONEON-NOSVE-NEXT: tbz w8, #25, .LBB3_26 -; NONEON-NOSVE-NEXT: .LBB3_58: // %cond.store49 +; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.store49 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #25] ; NONEON-NOSVE-NEXT: tbz w8, #26, .LBB3_27 -; NONEON-NOSVE-NEXT: .LBB3_59: // %cond.store51 +; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.store51 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #26] ; NONEON-NOSVE-NEXT: tbz w8, #27, .LBB3_28 -; NONEON-NOSVE-NEXT: .LBB3_60: // %cond.store53 +; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.store53 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #27] ; NONEON-NOSVE-NEXT: tbz w8, #28, .LBB3_29 -; NONEON-NOSVE-NEXT: .LBB3_61: // %cond.store55 +; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.store55 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #28] ; NONEON-NOSVE-NEXT: tbz w8, #29, .LBB3_30 -; NONEON-NOSVE-NEXT: .LBB3_62: // %cond.store57 +; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.store57 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #29] ; NONEON-NOSVE-NEXT: tbz w8, #30, .LBB3_31 -; NONEON-NOSVE-NEXT: .LBB3_63: // %cond.store59 +; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.store59 ; NONEON-NOSVE-NEXT: strb wzr, [x0, #30] -; NONEON-NOSVE-NEXT: tbz w8, #31, .LBB3_32 -; NONEON-NOSVE-NEXT: .LBB3_64: // %cond.store61 -; NONEON-NOSVE-NEXT: strb wzr, [x0, #31] -; NONEON-NOSVE-NEXT: ret +; NONEON-NOSVE-NEXT: tbnz w8, #31, .LBB3_32 +; NONEON-NOSVE-NEXT: b .LBB3_33 call void @llvm.masked.store.v32i8(<32 x i8> zeroinitializer, ptr %dst, i32 8, <32 x i1> %mask) ret void } @@ -571,17 +771,18 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI4_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] -; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: bfxil w8, w9, #0, #1 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB4_3 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB4_4 ; NONEON-NOSVE-NEXT: .LBB4_2: // %else2 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB4_3: // %cond.store ; NONEON-NOSVE-NEXT: fmov s0, wzr @@ -590,6 +791,7 @@ define void @masked_store_v2f16(ptr %dst, <2 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB4_4: // %cond.store1 ; NONEON-NOSVE-NEXT: fmov s0, wzr ; NONEON-NOSVE-NEXT: str h0, [x0, #2] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v2f16(<2 x half> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void @@ -609,13 +811,21 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI5_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI5_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB5_5 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB5_6 @@ -624,6 +834,7 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB5_3: // %else4 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB5_8 ; NONEON-NOSVE-NEXT: .LBB5_4: // %else6 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB5_5: // %cond.store ; NONEON-NOSVE-NEXT: fmov s0, wzr @@ -640,6 +851,7 @@ define void @masked_store_v4f16(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB5_8: // %cond.store5 ; NONEON-NOSVE-NEXT: fmov s0, wzr ; NONEON-NOSVE-NEXT: str h0, [x0, #6] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f16(<4 x half> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void @@ -660,14 +872,39 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI6_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w8, s0 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB6_9 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #6] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: and w9, w9, #0x8 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: and w10, w10, #0x2 +; NONEON-NOSVE-NEXT: and w12, w12, #0x10 +; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w11, w13, #0x20 +; NONEON-NOSVE-NEXT: orr w8, w8, w12 +; NONEON-NOSVE-NEXT: and w12, w14, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: orr w10, w11, w12 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: and w9, w9, #0x80 +; NONEON-NOSVE-NEXT: add w9, w8, w9 +; NONEON-NOSVE-NEXT: and w8, w9, #0xff +; NONEON-NOSVE-NEXT: tbnz w9, #0, .LBB6_9 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB6_10 ; NONEON-NOSVE-NEXT: .LBB6_2: // %else2 @@ -683,6 +920,7 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB6_7: // %else12 ; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB6_16 ; NONEON-NOSVE-NEXT: .LBB6_8: // %else14 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB6_9: // %cond.store ; NONEON-NOSVE-NEXT: fmov s0, wzr @@ -715,6 +953,7 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB6_16: // %cond.store13 ; NONEON-NOSVE-NEXT: fmov s0, wzr ; NONEON-NOSVE-NEXT: str h0, [x0, #14] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void @@ -743,15 +982,89 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.16b, v0.16b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 -; NONEON-NOSVE-NEXT: ldr q1, [x8, :lo12:.LCPI7_0] -; NONEON-NOSVE-NEXT: cmlt v0.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: and v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: and w8, w8, #0x1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x80 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x40 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: addv h0, v0.8h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #44] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: add w10, w12, w13 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w14 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w10 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB7_17 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB7_18 @@ -784,6 +1097,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB7_15: // %else28 ; NONEON-NOSVE-NEXT: tbnz w8, #15, .LBB7_32 ; NONEON-NOSVE-NEXT: .LBB7_16: // %else30 +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB7_17: // %cond.store ; NONEON-NOSVE-NEXT: fmov s0, wzr @@ -848,6 +1162,7 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB7_32: // %cond.store29 ; NONEON-NOSVE-NEXT: fmov s0, wzr ; NONEON-NOSVE-NEXT: str h0, [x0, #30] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v16f16(<16 x half> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void @@ -868,13 +1183,21 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI8_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB8_5 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB8_6 @@ -883,6 +1206,7 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB8_3: // %else4 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB8_8 ; NONEON-NOSVE-NEXT: .LBB8_4: // %else6 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB8_5: // %cond.store ; NONEON-NOSVE-NEXT: str wzr, [x0] @@ -895,6 +1219,7 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB8_4 ; NONEON-NOSVE-NEXT: .LBB8_8: // %cond.store5 ; NONEON-NOSVE-NEXT: str wzr, [x0, #12] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void @@ -949,14 +1274,39 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.8b, v0.8b, #7 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI9_0] -; NONEON-NOSVE-NEXT: cmlt v0.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv b0, v0.8b -; NONEON-NOSVE-NEXT: fmov w8, s0 -; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB9_9 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w10, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w12, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w13, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w14, [sp, #6] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: ldrb w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w12, w12, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w13, w13, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x4 +; NONEON-NOSVE-NEXT: and w9, w9, #0x8 +; NONEON-NOSVE-NEXT: sbfx w14, w14, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: and w10, w10, #0x2 +; NONEON-NOSVE-NEXT: and w12, w12, #0x10 +; NONEON-NOSVE-NEXT: bfxil w10, w11, #0, #1 +; NONEON-NOSVE-NEXT: and w11, w13, #0x20 +; NONEON-NOSVE-NEXT: orr w8, w8, w12 +; NONEON-NOSVE-NEXT: and w12, w14, #0x40 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: orr w8, w10, w8 +; NONEON-NOSVE-NEXT: orr w10, w11, w12 +; NONEON-NOSVE-NEXT: orr w8, w8, w10 +; NONEON-NOSVE-NEXT: and w9, w9, #0x80 +; NONEON-NOSVE-NEXT: add w9, w8, w9 +; NONEON-NOSVE-NEXT: and w8, w9, #0xff +; NONEON-NOSVE-NEXT: tbnz w9, #0, .LBB9_9 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB9_10 ; NONEON-NOSVE-NEXT: .LBB9_2: // %else2 @@ -972,6 +1322,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB9_7: // %else12 ; NONEON-NOSVE-NEXT: tbnz w8, #7, .LBB9_16 ; NONEON-NOSVE-NEXT: .LBB9_8: // %else14 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB9_9: // %cond.store ; NONEON-NOSVE-NEXT: str wzr, [x0] @@ -996,6 +1347,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #7, .LBB9_8 ; NONEON-NOSVE-NEXT: .LBB9_16: // %cond.store13 ; NONEON-NOSVE-NEXT: str wzr, [x0, #28] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void @@ -1016,23 +1368,25 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #31 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI10_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI10_0] -; NONEON-NOSVE-NEXT: cmlt v0.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addp v0.2s, v0.2s, v0.2s -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: bfxil w8, w9, #0, #1 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB10_3 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB10_4 ; NONEON-NOSVE-NEXT: .LBB10_2: // %else2 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB10_3: // %cond.store ; NONEON-NOSVE-NEXT: str xzr, [x0] ; NONEON-NOSVE-NEXT: tbz w8, #1, .LBB10_2 ; NONEON-NOSVE-NEXT: .LBB10_4: // %cond.store1 ; NONEON-NOSVE-NEXT: str xzr, [x0, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void @@ -1061,13 +1415,21 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; ; NONEON-NOSVE-LABEL: masked_store_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #15 -; NONEON-NOSVE-NEXT: adrp x8, .LCPI11_0 -; NONEON-NOSVE-NEXT: ldr d1, [x8, :lo12:.LCPI11_0] -; NONEON-NOSVE-NEXT: cmlt v0.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: and v0.8b, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: addv h0, v0.4h -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w11, [sp] +; NONEON-NOSVE-NEXT: sbfx w8, w8, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #0, #1 +; NONEON-NOSVE-NEXT: sbfx w10, w10, #0, #1 +; NONEON-NOSVE-NEXT: and w8, w8, #0x2 +; NONEON-NOSVE-NEXT: and w9, w9, #0x4 +; NONEON-NOSVE-NEXT: and w10, w10, #0x8 +; NONEON-NOSVE-NEXT: bfxil w8, w11, #0, #1 +; NONEON-NOSVE-NEXT: orr w9, w9, w10 +; NONEON-NOSVE-NEXT: orr w8, w8, w9 ; NONEON-NOSVE-NEXT: tbnz w8, #0, .LBB11_5 ; NONEON-NOSVE-NEXT: // %bb.1: // %else ; NONEON-NOSVE-NEXT: tbnz w8, #1, .LBB11_6 @@ -1076,6 +1438,7 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: .LBB11_3: // %else4 ; NONEON-NOSVE-NEXT: tbnz w8, #3, .LBB11_8 ; NONEON-NOSVE-NEXT: .LBB11_4: // %else6 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret ; NONEON-NOSVE-NEXT: .LBB11_5: // %cond.store ; NONEON-NOSVE-NEXT: str xzr, [x0] @@ -1088,6 +1451,7 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; NONEON-NOSVE-NEXT: tbz w8, #3, .LBB11_4 ; NONEON-NOSVE-NEXT: .LBB11_8: // %cond.store5 ; NONEON-NOSVE-NEXT: str xzr, [x0, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret call void @llvm.masked.store.v4f64(<4 x double> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll index 6a6b47e815ac16..dbdf5f25029998 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll @@ -18,11 +18,22 @@ define void @add_v4i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: ldr s1, [x1] -; NONEON-NOSVE-NEXT: uaddl v0.8h, v0.8b, v1.8b -; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; NONEON-NOSVE-NEXT: str s0, [x0] +; NONEON-NOSVE-NEXT: ldrb w8, [x0, #3] +; NONEON-NOSVE-NEXT: ldrb w9, [x1, #3] +; NONEON-NOSVE-NEXT: ldrb w10, [x0, #2] +; NONEON-NOSVE-NEXT: ldrb w11, [x0, #1] +; NONEON-NOSVE-NEXT: ldrb w12, [x1, #2] +; NONEON-NOSVE-NEXT: ldrb w13, [x0] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: ldrb w14, [x1, #1] +; NONEON-NOSVE-NEXT: ldrb w9, [x1] +; NONEON-NOSVE-NEXT: add w10, w10, w12 +; NONEON-NOSVE-NEXT: strb w8, [x0, #3] +; NONEON-NOSVE-NEXT: add w8, w11, w14 +; NONEON-NOSVE-NEXT: add w9, w13, w9 +; NONEON-NOSVE-NEXT: strb w10, [x0, #2] +; NONEON-NOSVE-NEXT: strb w8, [x0, #1] +; NONEON-NOSVE-NEXT: strb w9, [x0] ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i8>, ptr %a %op2 = load <4 x i8>, ptr %b @@ -42,10 +53,46 @@ define void @add_v8i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ldr d1, [x1] -; NONEON-NOSVE-NEXT: add v0.8b, v0.8b, v1.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [x1] +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i8>, ptr %a %op2 = load <8 x i8>, ptr %b @@ -65,10 +112,77 @@ define void @add_v16i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i8>, ptr %a %op2 = load <16 x i8>, ptr %b @@ -89,11 +203,143 @@ define void @add_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: add_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v2.16b, v3.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #47] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #71] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -116,17 +362,12 @@ define void @add_v2i16(ptr %a, ptr %b, ptr %c) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldrh w8, [x0] ; NONEON-NOSVE-NEXT: ldrh w9, [x1] -; NONEON-NOSVE-NEXT: fmov s0, w8 -; NONEON-NOSVE-NEXT: fmov s1, w9 -; NONEON-NOSVE-NEXT: add x8, x0, #2 -; NONEON-NOSVE-NEXT: add x9, x1, #2 -; NONEON-NOSVE-NEXT: ld1 { v0.h }[2], [x8] -; NONEON-NOSVE-NEXT: ld1 { v1.h }[2], [x9] -; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: mov w8, v0.s[1] -; NONEON-NOSVE-NEXT: fmov w9, s0 -; NONEON-NOSVE-NEXT: strh w9, [x0] -; NONEON-NOSVE-NEXT: strh w8, [x0, #2] +; NONEON-NOSVE-NEXT: ldrh w10, [x0, #2] +; NONEON-NOSVE-NEXT: ldrh w11, [x1, #2] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: add w9, w10, w11 +; NONEON-NOSVE-NEXT: strh w8, [x0] +; NONEON-NOSVE-NEXT: strh w9, [x0, #2] ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i16>, ptr %a %op2 = load <2 x i16>, ptr %b @@ -146,10 +387,30 @@ define void @add_v4i16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: add_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ldr d1, [x1] -; NONEON-NOSVE-NEXT: add v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [x1] +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i16>, ptr %a %op2 = load <4 x i16>, ptr %b @@ -169,10 +430,45 @@ define void @add_v8i16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: add_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b @@ -193,11 +489,79 @@ define void @add_v16i16(ptr %a, ptr %b, ptr %c) { ; ; NONEON-NOSVE-LABEL: add_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -218,8 +582,18 @@ define void @abs_v2i32(ptr %a) { ; NONEON-NOSVE-LABEL: abs_v2i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: abs v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i32>, ptr %a %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false) @@ -239,8 +613,25 @@ define void @abs_v4i32(ptr %a) { ; NONEON-NOSVE-LABEL: abs_v4i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i32>, ptr %a %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false) @@ -260,10 +651,40 @@ define void @abs_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.4s, v0.4s -; NONEON-NOSVE-NEXT: abs v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w9, w8, mi +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: cmp w8, #0 +; NONEON-NOSVE-NEXT: cneg w8, w8, mi +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false) @@ -283,8 +704,18 @@ define void @abs_v2i64(ptr %a) { ; NONEON-NOSVE-LABEL: abs_v2i64: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x i64>, ptr %a %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false) @@ -304,10 +735,26 @@ define void @abs_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: abs_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: abs v0.2d, v0.2d -; NONEON-NOSVE-NEXT: abs v1.2d, v1.2d +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x9, x8, mi +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: cmp x8, #0 +; NONEON-NOSVE-NEXT: cneg x8, x8, mi +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false) @@ -328,13 +775,32 @@ define void @fadd_v2f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: ldr s1, [x1] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: str s0, [x0] +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: str w8, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr w8, [x1] +; NONEON-NOSVE-NEXT: str w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: str d0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: str w8, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x half>, ptr %a %op2 = load <2 x half>, ptr %b @@ -355,13 +821,42 @@ define void @fadd_v4f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ldr d1, [x1] -; NONEON-NOSVE-NEXT: fcvtl v1.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v0.4s, v0.4h -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [x1] +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #26] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x half>, ptr %a %op2 = load <4 x half>, ptr %b @@ -382,17 +877,69 @@ define void @fadd_v8f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fcvtl v2.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v3.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fadd v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v2.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #42] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #40] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #38] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #36] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #34] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x half>, ptr %a %op2 = load <8 x half>, ptr %b @@ -415,25 +962,127 @@ define void @fadd_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fcvtl v4.4s, v0.4h -; NONEON-NOSVE-NEXT: fcvtl v6.4s, v3.4h -; NONEON-NOSVE-NEXT: fcvtl2 v0.4s, v0.8h -; NONEON-NOSVE-NEXT: fcvtl v5.4s, v1.4h -; NONEON-NOSVE-NEXT: fcvtl v7.4s, v2.4h -; NONEON-NOSVE-NEXT: fcvtl2 v1.4s, v1.8h -; NONEON-NOSVE-NEXT: fcvtl2 v3.4s, v3.8h -; NONEON-NOSVE-NEXT: fcvtl2 v2.4s, v2.8h -; NONEON-NOSVE-NEXT: fadd v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: fadd v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: fcvtn v1.4h, v4.4s -; NONEON-NOSVE-NEXT: fcvtn v3.4h, v5.4s -; NONEON-NOSVE-NEXT: fcvtn2 v1.8h, v0.4s -; NONEON-NOSVE-NEXT: fcvtn2 v3.8h, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ldr h1, [sp, #46] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #44] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #94] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #42] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #92] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #40] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #90] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #38] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #36] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #86] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #34] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #84] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #32] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #82] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #14] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #80] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #12] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #10] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #8] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #74] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #6] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #72] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #4] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #70] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp, #2] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #68] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldr h1, [sp] +; NONEON-NOSVE-NEXT: fcvt s1, h1 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #66] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] +; NONEON-NOSVE-NEXT: fcvt s0, h0 +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: fcvt h0, s0 +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -454,10 +1103,20 @@ define void @fadd_v2f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ldr d1, [x1] -; NONEON-NOSVE-NEXT: fadd v0.2s, v0.2s, v1.2s +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr d0, [x1] +; NONEON-NOSVE-NEXT: ldr d1, [x0] +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x float>, ptr %a %op2 = load <2 x float>, ptr %b @@ -478,10 +1137,25 @@ define void @fadd_v4f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fadd v0.4s, v0.4s, v1.4s +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x float>, ptr %a %op2 = load <4 x float>, ptr %b @@ -504,11 +1178,39 @@ define void @fadd_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fadd v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v2.4s, v3.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #88] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s2, [sp] +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] +; NONEON-NOSVE-NEXT: fadd s3, s2, s0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -529,10 +1231,19 @@ define void @fadd_v2f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: fadd v0.2d, v0.2d, v1.2d +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %op1 = load <2 x double>, ptr %a %op2 = load <2 x double>, ptr %b @@ -555,11 +1266,27 @@ define void @fadd_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: fadd_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q3, [x1] -; NONEON-NOSVE-NEXT: ldp q1, q2, [x0] -; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: fadd v1.2d, v2.2d, v3.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d2, [sp] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #80] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd d3, d2, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll index 03bb899c517b4e..8c23f5f9922da7 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll @@ -19,10 +19,70 @@ define void @test_revbv16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revbv16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> @@ -43,10 +103,70 @@ define void @test_revbv8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revbv8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> @@ -67,10 +187,70 @@ define void @test_revbv4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revbv4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> @@ -91,10 +271,34 @@ define void @test_revhv8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revhv8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev32 v0.8h, v0.8h -; NONEON-NOSVE-NEXT: rev32 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> @@ -115,10 +319,34 @@ define void @test_revhv8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revhv8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev32 v0.8h, v0.8h -; NONEON-NOSVE-NEXT: rev32 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x half>, ptr %a %tmp2 = shufflevector <16 x half> %tmp1, <16 x half> undef, <16 x i32> @@ -139,10 +367,34 @@ define void @test_revhv4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revhv4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.8h, v0.8h -; NONEON-NOSVE-NEXT: rev64 v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <16 x i32> @@ -163,10 +415,22 @@ define void @test_revwv4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revwv4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s -; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -187,10 +451,22 @@ define void @test_revwv4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revwv4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s -; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #32] +; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x float>, ptr %a %tmp2 = shufflevector <8 x float> %tmp1, <8 x float> undef, <8 x i32> @@ -210,7 +486,42 @@ define <16 x i8> @test_revv16i8(ptr %a) { ; NONEON-NOSVE-LABEL: test_revv16i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i8>, ptr %a %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> @@ -230,10 +541,22 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: test_revwv8i32v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x1] -; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s -; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #32] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = load <8 x i32>, ptr %b @@ -258,14 +581,58 @@ define void @test_revhv32i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revhv32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.8h, v0.8h -; NONEON-NOSVE-NEXT: rev64 v1.8h, v1.8h -; NONEON-NOSVE-NEXT: rev64 v2.8h, v2.8h -; NONEON-NOSVE-NEXT: rev64 v3.8h, v3.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: str q1, [sp, #96] +; NONEON-NOSVE-NEXT: str q3, [sp, #64] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] +; NONEON-NOSVE-NEXT: ldr q2, [sp, #48] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #108] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #100] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldr q3, [sp, #112] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #70] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldur w8, [sp, #74] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stur w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #80] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: ror w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: stp q3, q2, [x0] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i16>, ptr %a %tmp2 = shufflevector <32 x i16> %tmp1, <32 x i16> undef, <32 x i32> @@ -285,10 +652,18 @@ define void @test_rev_elts_fail(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_rev_elts_fail: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp] +; NONEON-NOSVE-NEXT: str q1, [sp, #32] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> @@ -358,12 +733,23 @@ define void @test_revv8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: test_revv8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.4s, v0.4s -; NONEON-NOSVE-NEXT: rev64 v1.4s, v1.4s -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index f254a1f9098f2d..bc6fdd1ecd5a71 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -72,14 +72,82 @@ define void @zip1_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip1_v32i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: zip2 v2.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: strb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <32 x i8>, ptr %a %tmp2 = load volatile <32 x i8>, ptr %b @@ -212,24 +280,149 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip_v32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q4, q0, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q5, q1, [x0] -; NONEON-NOSVE-NEXT: ldp q6, q2, [x1, #32] -; NONEON-NOSVE-NEXT: ldp q7, q3, [x1] -; NONEON-NOSVE-NEXT: zip1 v17.8h, v0.8h, v2.8h -; NONEON-NOSVE-NEXT: zip2 v0.8h, v0.8h, v2.8h -; NONEON-NOSVE-NEXT: zip1 v16.8h, v1.8h, v3.8h -; NONEON-NOSVE-NEXT: zip2 v1.8h, v1.8h, v3.8h -; NONEON-NOSVE-NEXT: zip1 v2.8h, v5.8h, v7.8h -; NONEON-NOSVE-NEXT: zip1 v3.8h, v4.8h, v6.8h -; NONEON-NOSVE-NEXT: zip2 v5.8h, v5.8h, v7.8h -; NONEON-NOSVE-NEXT: zip2 v4.8h, v4.8h, v6.8h -; NONEON-NOSVE-NEXT: add v6.8h, v16.8h, v17.8h -; NONEON-NOSVE-NEXT: add v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: add v2.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: stp q6, q0, [x0, #32] -; NONEON-NOSVE-NEXT: stp q1, q2, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #192 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 192 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1, #32] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x1] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #32] +; NONEON-NOSVE-NEXT: stp q3, q5, [sp] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #30] +; NONEON-NOSVE-NEXT: stp q6, q2, [sp, #32] +; NONEON-NOSVE-NEXT: stp q7, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q4, q1, [sp, #96] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #126] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #190] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #188] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #124] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #186] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #76] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #184] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #122] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #182] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #180] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #120] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #178] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #72] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #176] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #118] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #172] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #116] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #68] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #168] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #114] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #66] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #164] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #112] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #64] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #110] +; NONEON-NOSVE-NEXT: strh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #160] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #62] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #108] +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #92] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #60] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #106] +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #90] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #58] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #104] +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #88] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #102] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #86] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: strh w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #100] +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #84] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #98] +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #82] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #50] +; NONEON-NOSVE-NEXT: strh w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #96] +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #80] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #32] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #128] +; NONEON-NOSVE-NEXT: stp q3, q2, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #192 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i16>, ptr %a %tmp2 = load <32 x i16>, ptr %b @@ -282,14 +475,50 @@ define void @zip1_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip1_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: zip2 v2.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: zip1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <16 x i16>, ptr %a %tmp2 = load volatile <16 x i16>, ptr %b @@ -326,14 +555,26 @@ define void @zip1_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip1_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #56] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = load volatile <8 x i32>, ptr %b @@ -360,15 +601,28 @@ define void @zip_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v3.2d -; NONEON-NOSVE-NEXT: zip1 v5.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: zip2 v1.2d, v1.2d, v3.2d -; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: fadd v2.2d, v4.2d, v5.2d -; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: stp q2, q0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d3, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fadd d0, d3, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b @@ -405,12 +659,29 @@ define void @zip_v4i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: zip1 v2.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: zip2 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #4] +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i32>, ptr %a %tmp2 = load <4 x i32>, ptr %b @@ -436,12 +707,22 @@ define void @zip1_v8i32_undef(ptr %a) { ; ; NONEON-NOSVE-LABEL: zip1_v8i32_undef: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: zip2 v1.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: str q1, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w8, w10, [sp, #24] +; NONEON-NOSVE-NEXT: stp w9, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w10, w10, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -465,15 +746,131 @@ define void @trn_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: trn1 v4.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: trn2 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: trn1 v1.16b, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: trn2 v2.16b, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: add v0.16b, v4.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #62] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #60] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #58] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #56] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #54] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #52] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #50] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #67] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = load <32 x i8>, ptr %b @@ -500,15 +897,32 @@ define void @trn_v8i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 -; NONEON-NOSVE-NEXT: adrp x9, .LCPI8_1 -; NONEON-NOSVE-NEXT: ldr q1, [x0] -; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] -; NONEON-NOSVE-NEXT: ldr q2, [x9, :lo12:.LCPI8_1] -; NONEON-NOSVE-NEXT: tbl v0.16b, { v1.16b }, v0.16b -; NONEON-NOSVE-NEXT: tbl v1.16b, { v1.16b }, v2.16b -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v1.8h +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #4] +; NONEON-NOSVE-NEXT: add w10, w9, w8 +; NONEON-NOSVE-NEXT: strh w10, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #8] +; NONEON-NOSVE-NEXT: add w10, w11, w10 +; NONEON-NOSVE-NEXT: strh w10, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #12] +; NONEON-NOSVE-NEXT: add w11, w10, w11 +; NONEON-NOSVE-NEXT: add w8, w8, w10 +; NONEON-NOSVE-NEXT: strh w11, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: add w11, w12, w11 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: strh w11, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i16>, ptr %a %tmp2 = load <8 x i16>, ptr %b @@ -535,15 +949,79 @@ define void @trn_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: trn1 v4.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: trn2 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: trn1 v1.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: trn2 v2.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: add v0.8h, v4.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = load <16 x i16>, ptr %b @@ -570,15 +1048,25 @@ define void @trn_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: zip1 v4.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: trn1 v1.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: trn2 v2.4s, v2.4s, v3.4s -; NONEON-NOSVE-NEXT: add v0.4s, v4.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: ldr q0, [x1, #16] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #48] +; NONEON-NOSVE-NEXT: str q2, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #32] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #64] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp2 = load <8 x i32>, ptr %b @@ -606,15 +1094,25 @@ define void @trn_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q1, q3, [x1] -; NONEON-NOSVE-NEXT: zip1 v4.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v1.2d -; NONEON-NOSVE-NEXT: zip1 v1.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: zip2 v2.2d, v2.2d, v3.2d -; NONEON-NOSVE-NEXT: fadd v0.2d, v4.2d, v0.2d -; NONEON-NOSVE-NEXT: fadd v1.2d, v1.2d, v2.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q3, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: stp q2, q3, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #32] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b @@ -639,12 +1137,23 @@ define void @trn_v4f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: trn_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: trn1 v2.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: fadd s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #8] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #40] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: fadd s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x float>, ptr %a %tmp2 = load <4 x float>, ptr %b @@ -670,14 +1179,24 @@ define void @trn_v8i32_undef(ptr %a) { ; ; NONEON-NOSVE-LABEL: trn_v8i32_undef: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: trn1 v2.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: trn2 v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: trn1 v3.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: trn2 v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v3.4s, v1.4s +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -753,14 +1272,82 @@ define void @zip2_v32i8(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: zip2_v32i8: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: zip2 v2.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: zip1 v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: strb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: strb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <32 x i8>, ptr %a %tmp2 = load volatile <32 x i8>, ptr %b @@ -811,14 +1398,50 @@ define void @zip2_v16i16(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: zip2_v16i16: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: zip2 v2.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: zip1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: strh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <16 x i16>, ptr %a %tmp2 = load volatile <16 x i16>, ptr %b @@ -855,14 +1478,26 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: zip2_v8i32: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] ; NONEON-NOSVE-NEXT: ldr q1, [x1] ; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: str q2, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w11, [sp, #32] +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #8] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w10, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldp w8, w11, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w9, w11, [sp, #56] +; NONEON-NOSVE-NEXT: stp w10, w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = load volatile <8 x i32>, ptr %b @@ -886,12 +1521,22 @@ define void @zip2_v8i32_undef(ptr %a) #0{ ; ; NONEON-NOSVE-LABEL: zip2_v8i32_undef: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #48 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: zip2 v1.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: str q1, [x0, #16] -; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: str q0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp] +; NONEON-NOSVE-NEXT: ldp w8, w10, [sp, #24] +; NONEON-NOSVE-NEXT: stp w9, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w10, w10, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: str q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load volatile <8 x i32>, ptr %a %tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -1097,15 +1742,131 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: uzp_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: uzp1 v4.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp2 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: uzp2 v2.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: add v0.16b, v4.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #62] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #60] +; NONEON-NOSVE-NEXT: strb w8, [sp, #95] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #61] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #58] +; NONEON-NOSVE-NEXT: strb w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #59] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #56] +; NONEON-NOSVE-NEXT: strb w8, [sp, #93] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #57] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #54] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #55] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #52] +; NONEON-NOSVE-NEXT: strb w8, [sp, #91] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #53] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #50] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #51] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #89] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #49] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #87] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #85] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #83] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #81] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #75] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #69] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrb w9, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <32 x i8>, ptr %a %tmp2 = load <32 x i8>, ptr %b @@ -1133,12 +1894,21 @@ define void @uzp_v4i16(ptr %a, ptr %b) #0{ ; NONEON-NOSVE-LABEL: uzp_v4i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr d0, [x0] -; NONEON-NOSVE-NEXT: ext v1.8b, v0.8b, v0.8b, #6 -; NONEON-NOSVE-NEXT: ext v2.8b, v0.8b, v0.8b, #2 -; NONEON-NOSVE-NEXT: trn1 v1.4h, v0.4h, v1.4h -; NONEON-NOSVE-NEXT: zip1 v0.4h, v2.4h, v0.4h -; NONEON-NOSVE-NEXT: add v0.4h, v1.4h, v0.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: add w9, w9, w8 +; NONEON-NOSVE-NEXT: strh w9, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i16>, ptr %a %tmp2 = load <4 x i16>, ptr %b @@ -1260,15 +2030,79 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: uzp_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: uzp1 v4.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp2 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: add v0.8h, v4.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v2.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #60] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #52] +; NONEON-NOSVE-NEXT: strh w8, [sp, #92] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #88] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #84] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #44] +; NONEON-NOSVE-NEXT: strh w8, [sp, #80] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #36] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <16 x i16>, ptr %a %tmp2 = load <16 x i16>, ptr %b @@ -1312,15 +2146,31 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: uzp_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: uzp1 v4.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp2 v2.4s, v3.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v0.4s, v4.4s, v0.4s -; NONEON-NOSVE-NEXT: fadd v1.4s, v1.4s, v2.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp q2, q1, [x0] +; NONEON-NOSVE-NEXT: mov x8, #9205357640488583168 // =0x7fc000007fc00000 +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: str x8, [sp, #56] +; NONEON-NOSVE-NEXT: mov w8, #2143289344 // =0x7fc00000 +; NONEON-NOSVE-NEXT: str w8, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: str w8, [sp, #68] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: str s0, [sp, #52] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: fadd s2, s1, s0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #32] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: stp s0, s2, [sp, #72] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: fadd s0, s1, s0 +; NONEON-NOSVE-NEXT: str s0, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x float>, ptr %a %tmp2 = load <8 x float>, ptr %b @@ -1347,15 +2197,27 @@ define void @uzp_v4i64(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: uzp_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: zip2 v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: zip1 v1.2d, v3.2d, v2.2d -; NONEON-NOSVE-NEXT: zip2 v2.2d, v3.2d, v2.2d -; NONEON-NOSVE-NEXT: add v0.2d, v4.2d, v0.2d -; NONEON-NOSVE-NEXT: add v1.2d, v1.2d, v2.2d +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #48] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #88] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #16] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #80] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp, #32] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #72] +; NONEON-NOSVE-NEXT: ldp x9, x8, [sp] +; NONEON-NOSVE-NEXT: add x8, x9, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #64] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x i64>, ptr %a %tmp2 = load <4 x i64>, ptr %b @@ -1427,12 +2289,45 @@ define void @uzp_v8i16(ptr %a, ptr %b) #0{ ; ; NONEON-NOSVE-LABEL: uzp_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1] -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: uzp2 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: add v0.8h, v2.8h, v0.8h +; NONEON-NOSVE-NEXT: ldr q0, [x1] +; NONEON-NOSVE-NEXT: ldr q1, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #28] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: str q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i16>, ptr %a %tmp2 = load <8 x i16>, ptr %b @@ -1476,10 +2371,23 @@ define void @uzp_v8i32_undef(ptr %a) #0{ ; NONEON-NOSVE-LABEL: uzp_v8i32_undef: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp2 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: add v0.4s, v2.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #24] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: add w8, w9, w8 +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <8 x i32>, ptr %a %tmp3 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> @@ -1507,15 +2415,28 @@ define void @zip_vscale2_4(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: zip_vscale2_4: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x1] -; NONEON-NOSVE-NEXT: zip1 v4.2d, v1.2d, v3.2d -; NONEON-NOSVE-NEXT: zip1 v5.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: zip2 v1.2d, v1.2d, v3.2d -; NONEON-NOSVE-NEXT: zip2 v0.2d, v0.2d, v2.2d -; NONEON-NOSVE-NEXT: fadd v2.2d, v4.2d, v5.2d -; NONEON-NOSVE-NEXT: fadd v0.2d, v1.2d, v0.2d -; NONEON-NOSVE-NEXT: stp q2, q0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 96 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp] +; NONEON-NOSVE-NEXT: stp q3, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #56] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldp d3, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: fadd d0, d3, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #48] +; NONEON-NOSVE-NEXT: fadd d2, d1, d0 +; NONEON-NOSVE-NEXT: ldr d0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: fadd d0, d1, d0 +; NONEON-NOSVE-NEXT: stp d0, d2, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %tmp1 = load <4 x double>, ptr %a %tmp2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll index 41d2cb8a2c7564..8ebf713a671f49 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -39,19 +39,76 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ptest_v16i1: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: umaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: mov w8, #255 // =0xff +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: stp q1, q2, [sp] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #40] +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w9, w8, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp] +; NONEON-NOSVE-NEXT: csetm w10, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csetm w11, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #32] +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #48] +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: cmp w11, w10 +; NONEON-NOSVE-NEXT: csel w10, w11, w10, hi +; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: cmp w10, w9 +; NONEON-NOSVE-NEXT: csel w9, w10, w9, hi +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #16] +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #24] +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w10 +; NONEON-NOSVE-NEXT: csel w9, w9, w10, hi +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: csel w8, w8, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi ; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 @@ -113,29 +170,144 @@ define i1 @ptest_or_v16i1(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ptest_or_v16i1: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x1, #32] -; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 -; NONEON-NOSVE-NEXT: ldp q6, q7, [x1] -; NONEON-NOSVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: fcmeq v7.4s, v7.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v7.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: orn v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: umaxv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #48] +; NONEON-NOSVE-NEXT: str q2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #52] +; NONEON-NOSVE-NEXT: ldr q0, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: ldp s2, s0, [sp, #96] +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: csinv w8, w8, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldr s2, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w9, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: csinv w9, w9, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w10, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csinv w10, w10, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: csetm w11, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: csinv w11, w11, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s2, s0, [sp] +; NONEON-NOSVE-NEXT: orr w10, w11, w10 +; NONEON-NOSVE-NEXT: csetm w12, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csinv w12, w12, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csinv w14, w13, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #56] +; NONEON-NOSVE-NEXT: orr w12, w14, w12 +; NONEON-NOSVE-NEXT: orr w10, w12, w10 +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: orr w9, w10, w9 +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr q0, [x1, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: csinv w13, w13, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr s1, [sp, #64] +; NONEON-NOSVE-NEXT: csetm w15, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #112] +; NONEON-NOSVE-NEXT: csinv w15, w15, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: and w11, w15, #0xff +; NONEON-NOSVE-NEXT: csetm w16, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #68] +; NONEON-NOSVE-NEXT: csinv w16, w16, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csetm w17, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #120] +; NONEON-NOSVE-NEXT: csinv w17, w17, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #76] +; NONEON-NOSVE-NEXT: csetm w18, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr q1, [x1, #48] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: csinv w18, w18, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldr s2, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w0, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: csinv w0, w0, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csetm w1, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csinv w1, w1, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w2, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #88] +; NONEON-NOSVE-NEXT: csinv w2, w2, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w3, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: csinv w3, w3, wzr, eq +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csetm w4, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csinv w10, w4, wzr, eq +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, hi +; NONEON-NOSVE-NEXT: and w9, w13, #0xff +; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: and w8, w8, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: and w9, w16, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, hi +; NONEON-NOSVE-NEXT: and w11, w17, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: and w9, w18, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, hi +; NONEON-NOSVE-NEXT: and w11, w0, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: and w9, w1, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, hi +; NONEON-NOSVE-NEXT: and w11, w2, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: and w9, w3, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, hi +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, hi +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, hi ; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 @@ -207,29 +379,144 @@ define i1 @ptest_and_v16i1(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: ptest_and_v16i1: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x1, #32] -; NONEON-NOSVE-NEXT: fcmeq v1.4s, v1.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v0.4s, v0.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v3.4s, v3.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v2.4s, v2.4s, #0.0 -; NONEON-NOSVE-NEXT: ldp q6, q7, [x1] -; NONEON-NOSVE-NEXT: fcmeq v4.4s, v4.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v5.4s, v5.4s, #0.0 -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; NONEON-NOSVE-NEXT: fcmeq v7.4s, v7.4s, #0.0 -; NONEON-NOSVE-NEXT: fcmeq v6.4s, v6.4s, #0.0 -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v2.8h, v3.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v7.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: mvn v0.16b, v0.16b -; NONEON-NOSVE-NEXT: bic v0.16b, v0.16b, v1.16b -; NONEON-NOSVE-NEXT: uminv b0, v0.16b -; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: str q1, [sp] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #48] +; NONEON-NOSVE-NEXT: str q2, [sp, #32] +; NONEON-NOSVE-NEXT: ldr s1, [sp, #52] +; NONEON-NOSVE-NEXT: ldr q0, [x1, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #96] +; NONEON-NOSVE-NEXT: ldp s2, s0, [sp, #96] +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] +; NONEON-NOSVE-NEXT: csetm w8, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr q1, [x1] +; NONEON-NOSVE-NEXT: str q1, [sp, #16] +; NONEON-NOSVE-NEXT: csel w8, w8, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldr s2, [sp, #12] +; NONEON-NOSVE-NEXT: csetm w9, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #24] +; NONEON-NOSVE-NEXT: csel w9, w9, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: csetm w10, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w10, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: csetm w11, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s1, s0, [sp, #16] +; NONEON-NOSVE-NEXT: csel w11, w11, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s2, s0, [sp] +; NONEON-NOSVE-NEXT: and w10, w11, w10 +; NONEON-NOSVE-NEXT: csetm w12, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w12, w12, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #104] +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csel w14, w13, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #56] +; NONEON-NOSVE-NEXT: and w12, w14, w12 +; NONEON-NOSVE-NEXT: and w10, w12, w10 +; NONEON-NOSVE-NEXT: csetm w13, ne +; NONEON-NOSVE-NEXT: and w9, w10, w9 +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr q0, [x1, #32] +; NONEON-NOSVE-NEXT: str q0, [sp, #112] +; NONEON-NOSVE-NEXT: csel w13, w13, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr s1, [sp, #64] +; NONEON-NOSVE-NEXT: csetm w15, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #112] +; NONEON-NOSVE-NEXT: csel w15, w15, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: and w11, w15, #0xff +; NONEON-NOSVE-NEXT: csetm w16, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #68] +; NONEON-NOSVE-NEXT: csel w16, w16, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csetm w17, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #120] +; NONEON-NOSVE-NEXT: csel w17, w17, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #76] +; NONEON-NOSVE-NEXT: csetm w18, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldr q1, [x1, #48] +; NONEON-NOSVE-NEXT: str q1, [sp, #80] +; NONEON-NOSVE-NEXT: csel w18, w18, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: ldr s2, [sp, #32] +; NONEON-NOSVE-NEXT: csetm w0, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #80] +; NONEON-NOSVE-NEXT: csel w0, w0, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csetm w1, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csel w1, w1, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #36] +; NONEON-NOSVE-NEXT: csetm w2, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldp s0, s2, [sp, #88] +; NONEON-NOSVE-NEXT: csel w2, w2, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: csetm w3, ne +; NONEON-NOSVE-NEXT: fcmp s1, #0.0 +; NONEON-NOSVE-NEXT: csel w3, w3, wzr, ne +; NONEON-NOSVE-NEXT: fcmp s2, #0.0 +; NONEON-NOSVE-NEXT: csetm w4, ne +; NONEON-NOSVE-NEXT: fcmp s0, #0.0 +; NONEON-NOSVE-NEXT: csel w10, w4, wzr, ne +; NONEON-NOSVE-NEXT: cmp w9, w8 +; NONEON-NOSVE-NEXT: csel w8, w9, w8, lo +; NONEON-NOSVE-NEXT: and w9, w13, #0xff +; NONEON-NOSVE-NEXT: and w10, w10, #0xff +; NONEON-NOSVE-NEXT: and w8, w8, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: and w9, w16, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, lo +; NONEON-NOSVE-NEXT: and w11, w17, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: and w9, w18, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, lo +; NONEON-NOSVE-NEXT: and w11, w0, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: and w9, w1, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, lo +; NONEON-NOSVE-NEXT: and w11, w2, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: and w9, w3, #0xff +; NONEON-NOSVE-NEXT: cmp w8, w11 +; NONEON-NOSVE-NEXT: csel w8, w8, w11, lo +; NONEON-NOSVE-NEXT: cmp w8, w9 +; NONEON-NOSVE-NEXT: csel w8, w8, w9, lo +; NONEON-NOSVE-NEXT: cmp w8, w10 +; NONEON-NOSVE-NEXT: csel w8, w8, w10, lo ; NONEON-NOSVE-NEXT: and w0, w8, #0x1 +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %v0 = bitcast ptr %a to ptr %v1 = load <16 x float>, ptr %v0, align 4 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll index 5626f77c684f22..bc0fc7c79391d1 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll @@ -22,9 +22,26 @@ define <4 x i8> @bitreverse_v4i8(<4 x i8> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b -; NONEON-NOSVE-NEXT: ushr v0.4h, v0.4h, #8 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i8> @llvm.bitreverse.v4i8(<4 x i8> %op) ret <4 x i8> %res @@ -41,7 +58,42 @@ define <8 x i8> @bitreverse_v8i8(<8 x i8> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i8> @llvm.bitreverse.v8i8(<8 x i8> %op) ret <8 x i8> %res @@ -58,7 +110,74 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %op) ret <16 x i8> %res @@ -76,10 +195,140 @@ define void @bitreverse_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: bitreverse_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #31] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #29] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #28] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #27] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #26] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #25] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #23] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #21] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #20] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #19] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #18] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #17] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <32 x i8>, ptr %a %res = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %op) @@ -99,9 +348,17 @@ define <2 x i16> @bitreverse_v2i16(<2 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b -; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w9, w8, #16 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %op) ret <2 x i16> %res @@ -118,8 +375,26 @@ define <4 x i16> @bitreverse_v4i16(<4 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.bitreverse.v4i16(<4 x i16> %op) ret <4 x i16> %res @@ -136,8 +411,42 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -155,12 +464,76 @@ define void @bitreverse_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: bitreverse_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %op) @@ -179,8 +552,15 @@ define <2 x i32> @bitreverse_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %op) ret <2 x i32> %res @@ -197,8 +577,20 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -216,12 +608,32 @@ define void @bitreverse_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: bitreverse_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: rbit w9, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: rbit w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %op) @@ -240,8 +652,13 @@ define <1 x i64> @bitreverse_v1i64(<1 x i64> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev64 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: rbit v0.8b, v0.8b +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.bitreverse.v1i64(<1 x i64> %op) ret <1 x i64> %res @@ -258,8 +675,15 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) { ; ; NONEON-NOSVE-LABEL: bitreverse_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %op) ret <2 x i64> %res @@ -277,12 +701,22 @@ define void @bitreverse_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: bitreverse_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b -; NONEON-NOSVE-NEXT: rbit v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rbit v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: rbit x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: rbit x9, x8 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: rbit x8, x8 +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %op) @@ -306,8 +740,31 @@ define <2 x i16> @bswap_v2i16(<2 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b -; NONEON-NOSVE-NEXT: ushr v0.2s, v0.2s, #16 +; NONEON-NOSVE-NEXT: str d0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: str d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #22] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %op) ret <2 x i16> %res @@ -324,7 +781,26 @@ define <4 x i16> @bswap_v4i16(<4 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev16 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %op) ret <4 x i16> %res @@ -341,7 +817,42 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %op) ret <8 x i16> %res @@ -359,10 +870,79 @@ define void @bswap_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: bswap_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev16 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev16 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <16 x i16>, ptr %a %res = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %op) @@ -381,7 +961,26 @@ define <2 x i32> @bswap_v2i32(<2 x i32> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %op) ret <2 x i32> %res @@ -398,7 +997,42 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %op) ret <4 x i32> %res @@ -416,10 +1050,79 @@ define void @bswap_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: bswap_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev32 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev32 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <8 x i32>, ptr %a %res = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %op) @@ -438,7 +1141,26 @@ define <1 x i64> @bswap_v1i64(<1 x i64> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev64 v0.8b, v0.8b +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = call <1 x i64> @llvm.bswap.v1i64(<1 x i64> %op) ret <1 x i64> %res @@ -455,7 +1177,42 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) { ; ; NONEON-NOSVE-LABEL: bswap_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %op) ret <2 x i64> %res @@ -473,10 +1230,79 @@ define void @bswap_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: bswap_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: rev64 v0.16b, v0.16b -; NONEON-NOSVE-NEXT: rev64 v1.16b, v1.16b +; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr q0, [x0] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #9] +; NONEON-NOSVE-NEXT: str q0, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #11] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrb w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #1] +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #3] +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #5] +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #7] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #44] +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #33] +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #35] +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #36] +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #37] +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #39] +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op = load <4 x i64>, ptr %a %res = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %op) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll index 55f4f5bae641e5..df019ce2e0ad67 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll @@ -18,15 +18,38 @@ define <4 x i8> @sdiv_v4i8(<4 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v1.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: movi d2, #0xff00ff00ff00ff -; NONEON-NOSVE-NEXT: sshr v1.4h, v1.4h, #8 -; NONEON-NOSVE-NEXT: ushr v1.4h, v1.4h, #7 -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: usra v0.4h, v1.4h, #3 -; NONEON-NOSVE-NEXT: shl v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #8 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #5 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #2] +; NONEON-NOSVE-NEXT: ldrh w12, [sp] +; NONEON-NOSVE-NEXT: sxtb w11, w8 +; NONEON-NOSVE-NEXT: sxtb w13, w9 +; NONEON-NOSVE-NEXT: sxtb w14, w10 +; NONEON-NOSVE-NEXT: sxtb w15, w12 +; NONEON-NOSVE-NEXT: ubfx w11, w11, #10, #5 +; NONEON-NOSVE-NEXT: ubfx w13, w13, #10, #5 +; NONEON-NOSVE-NEXT: ubfx w14, w14, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w11 +; NONEON-NOSVE-NEXT: ubfx w11, w15, #10, #5 +; NONEON-NOSVE-NEXT: add w9, w9, w13 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: add w10, w10, w14 +; NONEON-NOSVE-NEXT: sxtb w9, w9 +; NONEON-NOSVE-NEXT: add w11, w12, w11 +; NONEON-NOSVE-NEXT: sxtb w10, w10 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: sxtb w11, w11 +; NONEON-NOSVE-NEXT: lsr w9, w9, #5 +; NONEON-NOSVE-NEXT: lsr w10, w10, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: lsr w8, w11, #5 +; NONEON-NOSVE-NEXT: strh w9, [sp, #12] +; NONEON-NOSVE-NEXT: strh w10, [sp, #10] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i8> %op1, shufflevector (<4 x i8> insertelement (<4 x i8> poison, i8 32, i32 0), <4 x i8> poison, <4 x i32> zeroinitializer) ret <4 x i8> %res @@ -43,9 +66,58 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.8b, v0.8b, #0 -; NONEON-NOSVE-NEXT: usra v0.8b, v1.8b, #3 -; NONEON-NOSVE-NEXT: sshr v0.8b, v0.8b, #5 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i8> %op1, shufflevector (<8 x i8> insertelement (<8 x i8> poison, i8 32, i32 0), <8 x i8> poison, <8 x i32> zeroinitializer) ret <8 x i8> %res @@ -62,9 +134,106 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: usra v0.16b, v1.16b, #3 -; NONEON-NOSVE-NEXT: sshr v0.16b, v0.16b, #5 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <16 x i8> %op1, shufflevector (<16 x i8> insertelement (<16 x i8> poison, i8 32, i32 0), <16 x i8> poison, <16 x i32> zeroinitializer) ret <16 x i8> %res @@ -82,14 +251,204 @@ define void @sdiv_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: sdiv_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v2.16b, v0.16b, #0 -; NONEON-NOSVE-NEXT: cmlt v3.16b, v1.16b, #0 -; NONEON-NOSVE-NEXT: usra v0.16b, v2.16b, #3 -; NONEON-NOSVE-NEXT: usra v1.16b, v3.16b, #3 -; NONEON-NOSVE-NEXT: sshr v0.16b, v0.16b, #5 -; NONEON-NOSVE-NEXT: sshr v1.16b, v1.16b, #5 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #63] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #61] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #59] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #57] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #23] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #55] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #22] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #21] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #53] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #20] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #19] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #51] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #18] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #17] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #49] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #15] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #13] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #11] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #9] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #7] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #5] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #3] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp, #1] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrsb w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #10, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxtb w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %res = sdiv <32 x i8> %op1, shufflevector (<32 x i8> insertelement (<32 x i8> poison, i8 32, i32 0), <32 x i8> poison, <32 x i32> zeroinitializer) @@ -109,16 +468,20 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: shl v1.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: mov w8, #31 // =0x1f -; NONEON-NOSVE-NEXT: dup v2.2s, w8 -; NONEON-NOSVE-NEXT: sshr v1.2s, v1.2s, #16 -; NONEON-NOSVE-NEXT: ushr v1.2s, v1.2s, #26 -; NONEON-NOSVE-NEXT: and v1.8b, v1.8b, v2.8b -; NONEON-NOSVE-NEXT: add v0.2s, v0.2s, v1.2s -; NONEON-NOSVE-NEXT: shl v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #16 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #5 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: sxth w10, w8 +; NONEON-NOSVE-NEXT: sxth w11, w9 +; NONEON-NOSVE-NEXT: ubfx w10, w10, #26, #5 +; NONEON-NOSVE-NEXT: ubfx w11, w11, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w10 +; NONEON-NOSVE-NEXT: add w9, w9, w11 +; NONEON-NOSVE-NEXT: sbfx w8, w8, #5, #11 +; NONEON-NOSVE-NEXT: sbfx w9, w9, #5, #11 +; NONEON-NOSVE-NEXT: stp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i16> %op1, shufflevector (<2 x i16> insertelement (<2 x i16> poison, i16 32, i32 0), <2 x i16> poison, <2 x i32> zeroinitializer) ret <2 x i16> %res @@ -135,9 +498,34 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.4h, v0.4h, #0 -; NONEON-NOSVE-NEXT: usra v0.4h, v1.4h, #11 -; NONEON-NOSVE-NEXT: sshr v0.4h, v0.4h, #5 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i16> %op1, shufflevector (<4 x i16> insertelement (<4 x i16> poison, i16 32, i32 0), <4 x i16> poison, <4 x i32> zeroinitializer) ret <4 x i16> %res @@ -154,9 +542,58 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.8h, v0.8h, #0 -; NONEON-NOSVE-NEXT: usra v0.8h, v1.8h, #11 -; NONEON-NOSVE-NEXT: sshr v0.8h, v0.8h, #5 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <8 x i16> %op1, shufflevector (<8 x i16> insertelement (<8 x i16> poison, i16 32, i32 0), <8 x i16> poison, <8 x i32> zeroinitializer) ret <8 x i16> %res @@ -174,14 +611,108 @@ define void @sdiv_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: sdiv_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v2.8h, v0.8h, #0 -; NONEON-NOSVE-NEXT: cmlt v3.8h, v1.8h, #0 -; NONEON-NOSVE-NEXT: usra v0.8h, v2.8h, #11 -; NONEON-NOSVE-NEXT: usra v1.8h, v3.8h, #11 -; NONEON-NOSVE-NEXT: sshr v0.8h, v0.8h, #5 -; NONEON-NOSVE-NEXT: sshr v1.8h, v1.8h, #5 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #26] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #58] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #22] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #54] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #18] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #50] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp, #2] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrsh w8, [sp] +; NONEON-NOSVE-NEXT: ubfx w9, w8, #26, #5 +; NONEON-NOSVE-NEXT: add w8, w8, w9 +; NONEON-NOSVE-NEXT: sxth w8, w8 +; NONEON-NOSVE-NEXT: lsr w8, w8, #5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %res = sdiv <16 x i16> %op1, shufflevector (<16 x i16> insertelement (<16 x i16> poison, i16 32, i32 0), <16 x i16> poison, <16 x i32> zeroinitializer) @@ -200,9 +731,19 @@ define <2 x i32> @sdiv_v2i32(<2 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.2s, v0.2s, #0 -; NONEON-NOSVE-NEXT: usra v0.2s, v1.2s, #27 -; NONEON-NOSVE-NEXT: sshr v0.2s, v0.2s, #5 +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i32> %op1, shufflevector (<2 x i32> insertelement (<2 x i32> poison, i32 32, i32 0), <2 x i32> poison, <2 x i32> zeroinitializer) ret <2 x i32> %res @@ -219,9 +760,28 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.4s, v0.4s, #0 -; NONEON-NOSVE-NEXT: usra v0.4s, v1.4s, #27 -; NONEON-NOSVE-NEXT: sshr v0.4s, v0.4s, #5 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <4 x i32> %op1, shufflevector (<4 x i32> insertelement (<4 x i32> poison, i32 32, i32 0), <4 x i32> poison, <4 x i32> zeroinitializer) ret <4 x i32> %res @@ -239,14 +799,48 @@ define void @sdiv_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: sdiv_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v2.4s, v0.4s, #0 -; NONEON-NOSVE-NEXT: cmlt v3.4s, v1.4s, #0 -; NONEON-NOSVE-NEXT: usra v0.4s, v2.4s, #27 -; NONEON-NOSVE-NEXT: usra v1.4s, v3.4s, #27 -; NONEON-NOSVE-NEXT: sshr v0.4s, v0.4s, #5 -; NONEON-NOSVE-NEXT: sshr v1.4s, v1.4s, #5 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w10, w8, #5 +; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: asr w9, w8, #31 +; NONEON-NOSVE-NEXT: add w8, w8, w9, lsr #27 +; NONEON-NOSVE-NEXT: asr w8, w8, #5 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %res = sdiv <8 x i32> %op1, shufflevector (<8 x i32> insertelement (<8 x i32> poison, i32 32, i32 0), <8 x i32> poison, <8 x i32> zeroinitializer) @@ -265,9 +859,15 @@ define <1 x i64> @sdiv_v1i64(<1 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt d1, d0, #0 -; NONEON-NOSVE-NEXT: usra d0, d1, #59 -; NONEON-NOSVE-NEXT: sshr d0, d0, #5 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: fmov x8, d0 +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x8, x8, #5 +; NONEON-NOSVE-NEXT: str x8, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %res = sdiv <1 x i64> %op1, shufflevector (<1 x i64> insertelement (<1 x i64> poison, i64 32, i32 0), <1 x i64> poison, <1 x i32> zeroinitializer) ret <1 x i64> %res @@ -285,9 +885,19 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) { ; ; NONEON-NOSVE-LABEL: sdiv_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: cmlt v1.2d, v0.2d, #0 -; NONEON-NOSVE-NEXT: usra v0.2d, v1.2d, #59 -; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #5 +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x10, x8, #5 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x8, x8, #5 +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %res = sdiv <2 x i64> %op1, shufflevector (<2 x i64> insertelement (<2 x i64> poison, i64 32, i32 0), <2 x i64> poison, <2 x i32> zeroinitializer) ret <2 x i64> %res @@ -305,14 +915,30 @@ define void @sdiv_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: sdiv_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q1, [x0] -; NONEON-NOSVE-NEXT: cmlt v2.2d, v0.2d, #0 -; NONEON-NOSVE-NEXT: cmlt v3.2d, v1.2d, #0 -; NONEON-NOSVE-NEXT: usra v0.2d, v2.2d, #59 -; NONEON-NOSVE-NEXT: usra v1.2d, v3.2d, #59 -; NONEON-NOSVE-NEXT: sshr v0.2d, v0.2d, #5 -; NONEON-NOSVE-NEXT: sshr v1.2d, v1.2d, #5 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x10, x8, #5 +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x8, x8, #5 +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x10, x8, #5 +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: asr x9, x8, #63 +; NONEON-NOSVE-NEXT: add x8, x8, x9, lsr #59 +; NONEON-NOSVE-NEXT: asr x8, x8, #5 +; NONEON-NOSVE-NEXT: stp x8, x10, [sp, #32] +; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %res = sdiv <4 x i64> %op1, shufflevector (<4 x i64> insertelement (<4 x i64> poison, i64 32, i32 0), <4 x i64> poison, <4 x i32> zeroinitializer) diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll index 38aaf860b7298c..b66e6d90135730 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll @@ -18,9 +18,15 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) { ; ; NONEON-NOSVE-LABEL: hang_when_merging_stores_after_legalisation: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer %interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> @@ -39,9 +45,25 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2 ; ; NONEON-NOSVE-LABEL: interleave_store_without_splat: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q2, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #64 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #48] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #64 ; NONEON-NOSVE-NEXT: ret %shuffle = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> %interleaved = shufflevector <8 x i32> %shuffle, <8 x i32> undef, <8 x i32> @@ -64,12 +86,40 @@ define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2) ; ; NONEON-NOSVE-LABEL: interleave_store_legalization: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: zip2 v4.4s, v1.4s, v3.4s -; NONEON-NOSVE-NEXT: zip1 v1.4s, v1.4s, v3.4s -; NONEON-NOSVE-NEXT: zip2 v3.4s, v0.4s, v2.4s -; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v2.4s -; NONEON-NOSVE-NEXT: stp q1, q4, [x0, #32] -; NONEON-NOSVE-NEXT: stp q0, q3, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #128 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 128 +; NONEON-NOSVE-NEXT: stp q1, q3, [sp, #16] +; NONEON-NOSVE-NEXT: stp q0, q2, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #100] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #108] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #92] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #104] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr q3, [sp, #112] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #44] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr q1, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: ldp q0, q2, [sp, #48] +; NONEON-NOSVE-NEXT: stp q2, q3, [x0] +; NONEON-NOSVE-NEXT: stp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #128 ; NONEON-NOSVE-NEXT: ret %interleaved.vec = shufflevector <8 x i32> %v1, <8 x i32> %v2, <16 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll index e15529e1926ac7..a4cf5d608fed6d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll @@ -19,7 +19,14 @@ define <4 x i8> @splat_v4i8(i8 %a) { ; ; NONEON-NOSVE-LABEL: splat_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.4h, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strh w0, [sp, #14] +; NONEON-NOSVE-NEXT: strh w0, [sp, #12] +; NONEON-NOSVE-NEXT: strh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i8> undef, i8 %a, i64 0 %splat = shufflevector <4 x i8> %insert, <4 x i8> undef, <4 x i32> zeroinitializer @@ -35,7 +42,18 @@ define <8 x i8> @splat_v8i8(i8 %a) { ; ; NONEON-NOSVE-LABEL: splat_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.8b, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strb w0, [sp, #15] +; NONEON-NOSVE-NEXT: strb w0, [sp, #14] +; NONEON-NOSVE-NEXT: strb w0, [sp, #13] +; NONEON-NOSVE-NEXT: strb w0, [sp, #12] +; NONEON-NOSVE-NEXT: strb w0, [sp, #11] +; NONEON-NOSVE-NEXT: strb w0, [sp, #10] +; NONEON-NOSVE-NEXT: strb w0, [sp, #9] +; NONEON-NOSVE-NEXT: strb w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i8> undef, i8 %a, i64 0 %splat = shufflevector <8 x i8> %insert, <8 x i8> undef, <8 x i32> zeroinitializer @@ -51,7 +69,25 @@ define <16 x i8> @splat_v16i8(i8 %a) { ; ; NONEON-NOSVE-LABEL: splat_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.16b, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strb w0, [sp, #15] +; NONEON-NOSVE-NEXT: strb w0, [sp, #14] +; NONEON-NOSVE-NEXT: strb w0, [sp, #13] +; NONEON-NOSVE-NEXT: strb w0, [sp, #12] +; NONEON-NOSVE-NEXT: strb w0, [sp, #11] +; NONEON-NOSVE-NEXT: strb w0, [sp, #10] +; NONEON-NOSVE-NEXT: strb w0, [sp, #9] +; NONEON-NOSVE-NEXT: strb w0, [sp, #8] +; NONEON-NOSVE-NEXT: strb w0, [sp, #7] +; NONEON-NOSVE-NEXT: strb w0, [sp, #6] +; NONEON-NOSVE-NEXT: strb w0, [sp, #5] +; NONEON-NOSVE-NEXT: strb w0, [sp, #4] +; NONEON-NOSVE-NEXT: strb w0, [sp, #3] +; NONEON-NOSVE-NEXT: strb w0, [sp, #2] +; NONEON-NOSVE-NEXT: strb w0, [sp, #1] +; NONEON-NOSVE-NEXT: strb w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i8> undef, i8 %a, i64 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer @@ -67,8 +103,27 @@ define void @splat_v32i8(i8 %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.16b, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strb w0, [sp, #15] +; NONEON-NOSVE-NEXT: strb w0, [sp, #14] +; NONEON-NOSVE-NEXT: strb w0, [sp, #13] +; NONEON-NOSVE-NEXT: strb w0, [sp, #12] +; NONEON-NOSVE-NEXT: strb w0, [sp, #11] +; NONEON-NOSVE-NEXT: strb w0, [sp, #10] +; NONEON-NOSVE-NEXT: strb w0, [sp, #9] +; NONEON-NOSVE-NEXT: strb w0, [sp, #8] +; NONEON-NOSVE-NEXT: strb w0, [sp, #7] +; NONEON-NOSVE-NEXT: strb w0, [sp, #6] +; NONEON-NOSVE-NEXT: strb w0, [sp, #5] +; NONEON-NOSVE-NEXT: strb w0, [sp, #4] +; NONEON-NOSVE-NEXT: strb w0, [sp, #3] +; NONEON-NOSVE-NEXT: strb w0, [sp, #2] +; NONEON-NOSVE-NEXT: strb w0, [sp, #1] +; NONEON-NOSVE-NEXT: strb w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <32 x i8> undef, i8 %a, i64 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer @@ -85,7 +140,11 @@ define <2 x i16> @splat_v2i16(i16 %a) { ; ; NONEON-NOSVE-LABEL: splat_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.2s, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp w0, w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i16> undef, i16 %a, i64 0 %splat = shufflevector <2 x i16> %insert, <2 x i16> undef, <2 x i32> zeroinitializer @@ -101,7 +160,14 @@ define <4 x i16> @splat_v4i16(i16 %a) { ; ; NONEON-NOSVE-LABEL: splat_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.4h, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strh w0, [sp, #14] +; NONEON-NOSVE-NEXT: strh w0, [sp, #12] +; NONEON-NOSVE-NEXT: strh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i16> undef, i16 %a, i64 0 %splat = shufflevector <4 x i16> %insert, <4 x i16> undef, <4 x i32> zeroinitializer @@ -117,7 +183,17 @@ define <8 x i16> @splat_v8i16(i16 %a) { ; ; NONEON-NOSVE-LABEL: splat_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.8h, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strh w0, [sp, #14] +; NONEON-NOSVE-NEXT: strh w0, [sp, #12] +; NONEON-NOSVE-NEXT: strh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w0, [sp, #8] +; NONEON-NOSVE-NEXT: strh w0, [sp, #6] +; NONEON-NOSVE-NEXT: strh w0, [sp, #4] +; NONEON-NOSVE-NEXT: strh w0, [sp, #2] +; NONEON-NOSVE-NEXT: strh w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i16> undef, i16 %a, i64 0 %splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer @@ -133,8 +209,19 @@ define void @splat_v16i16(i16 %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.8h, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: strh w0, [sp, #14] +; NONEON-NOSVE-NEXT: strh w0, [sp, #12] +; NONEON-NOSVE-NEXT: strh w0, [sp, #10] +; NONEON-NOSVE-NEXT: strh w0, [sp, #8] +; NONEON-NOSVE-NEXT: strh w0, [sp, #6] +; NONEON-NOSVE-NEXT: strh w0, [sp, #4] +; NONEON-NOSVE-NEXT: strh w0, [sp, #2] +; NONEON-NOSVE-NEXT: strh w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i16> undef, i16 %a, i64 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer @@ -151,7 +238,11 @@ define <2 x i32> @splat_v2i32(i32 %a) { ; ; NONEON-NOSVE-LABEL: splat_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.2s, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp w0, w0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i32> undef, i32 %a, i64 0 %splat = shufflevector <2 x i32> %insert, <2 x i32> undef, <2 x i32> zeroinitializer @@ -167,7 +258,11 @@ define <4 x i32> @splat_v4i32(i32 %a) { ; ; NONEON-NOSVE-LABEL: splat_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.4s, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp w0, w0, [sp, #8] +; NONEON-NOSVE-NEXT: stp w0, w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i32> undef, i32 %a, i64 0 %splat = shufflevector <4 x i32> %insert, <4 x i32> undef, <4 x i32> zeroinitializer @@ -183,8 +278,13 @@ define void @splat_v8i32(i32 %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.4s, w0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp w0, w0, [sp, #8] +; NONEON-NOSVE-NEXT: stp w0, w0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i32> undef, i32 %a, i64 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer @@ -201,7 +301,11 @@ define <1 x i64> @splat_v1i64(i64 %a) { ; ; NONEON-NOSVE-LABEL: splat_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov d0, x0 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str x0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x i64> undef, i64 %a, i64 0 %splat = shufflevector <1 x i64> %insert, <1 x i64> undef, <1 x i32> zeroinitializer @@ -217,7 +321,9 @@ define <2 x i64> @splat_v2i64(i64 %a) { ; ; NONEON-NOSVE-LABEL: splat_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.2d, x0 +; NONEON-NOSVE-NEXT: stp x0, x0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x i64> undef, i64 %a, i64 0 %splat = shufflevector <2 x i64> %insert, <2 x i64> undef, <2 x i32> zeroinitializer @@ -233,8 +339,11 @@ define void @splat_v4i64(i64 %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: dup v0.2d, x0 +; NONEON-NOSVE-NEXT: stp x0, x0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i64> undef, i64 %a, i64 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer @@ -256,8 +365,12 @@ define <2 x half> @splat_v2f16(half %a) { ; ; NONEON-NOSVE-LABEL: splat_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.4h, v0.h[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x half> undef, half %a, i64 0 %splat = shufflevector <2 x half> %insert, <2 x half> undef, <2 x i32> zeroinitializer @@ -274,8 +387,14 @@ define <4 x half> @splat_v4f16(half %a) { ; ; NONEON-NOSVE-LABEL: splat_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.4h, v0.h[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x half> undef, half %a, i64 0 %splat = shufflevector <4 x half> %insert, <4 x half> undef, <4 x i32> zeroinitializer @@ -292,8 +411,17 @@ define <8 x half> @splat_v8f16(half %a) { ; ; NONEON-NOSVE-LABEL: splat_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.8h, v0.h[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #6] +; NONEON-NOSVE-NEXT: str h0, [sp, #4] +; NONEON-NOSVE-NEXT: str h0, [sp, #2] +; NONEON-NOSVE-NEXT: str h0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x half> undef, half %a, i64 0 %splat = shufflevector <8 x half> %insert, <8 x half> undef, <8 x i32> zeroinitializer @@ -310,9 +438,19 @@ define void @splat_v16f16(half %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $h0 killed $h0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.8h, v0.h[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str h0, [sp, #14] +; NONEON-NOSVE-NEXT: str h0, [sp, #12] +; NONEON-NOSVE-NEXT: str h0, [sp, #10] +; NONEON-NOSVE-NEXT: str h0, [sp, #8] +; NONEON-NOSVE-NEXT: str h0, [sp, #6] +; NONEON-NOSVE-NEXT: str h0, [sp, #4] +; NONEON-NOSVE-NEXT: str h0, [sp, #2] +; NONEON-NOSVE-NEXT: str h0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x half> undef, half %a, i64 0 %splat = shufflevector <16 x half> %insert, <16 x half> undef, <16 x i32> zeroinitializer @@ -330,8 +468,11 @@ define <2 x float> @splat_v2f32(float %a, <2 x float> %op2) { ; ; NONEON-NOSVE-LABEL: splat_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2s, v0.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp s0, s0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x float> undef, float %a, i64 0 %splat = shufflevector <2 x float> %insert, <2 x float> undef, <2 x i32> zeroinitializer @@ -348,8 +489,11 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) { ; ; NONEON-NOSVE-LABEL: splat_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp s0, s0, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x float> undef, float %a, i64 0 %splat = shufflevector <4 x float> %insert, <4 x float> undef, <4 x i32> zeroinitializer @@ -366,9 +510,13 @@ define void @splat_v8f32(float %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $s0 killed $s0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: stp s0, s0, [sp, #8] +; NONEON-NOSVE-NEXT: stp s0, s0, [sp] +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x float> undef, float %a, i64 0 %splat = shufflevector <8 x float> %insert, <8 x float> undef, <8 x i32> zeroinitializer @@ -383,6 +531,11 @@ define <1 x double> @splat_v1f64(double %a, <1 x double> %op2) { ; ; NONEON-NOSVE-LABEL: splat_v1f64: ; NONEON-NOSVE: // %bb.0: +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <1 x double> undef, double %a, i64 0 %splat = shufflevector <1 x double> %insert, <1 x double> undef, <1 x i32> zeroinitializer @@ -399,8 +552,9 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) { ; ; NONEON-NOSVE-LABEL: splat_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2d, v0.d[0] +; NONEON-NOSVE-NEXT: stp d0, d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp], #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <2 x double> undef, double %a, i64 0 %splat = shufflevector <2 x double> %insert, <2 x double> undef, <2 x i32> zeroinitializer @@ -417,9 +571,11 @@ define void @splat_v4f64(double %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: splat_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NONEON-NOSVE-NEXT: dup v0.2d, v0.d[0] +; NONEON-NOSVE-NEXT: stp d0, d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldr q0, [sp] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x double> undef, double %a, i64 0 %splat = shufflevector <4 x double> %insert, <4 x double> undef, <4 x i32> zeroinitializer @@ -440,7 +596,8 @@ define void @splat_imm_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.16b, #1 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI24_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI24_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <32 x i8> undef, i8 1, i64 0 @@ -458,8 +615,8 @@ define void @splat_imm_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #2 // =0x2 -; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI25_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI25_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x i16> undef, i16 2, i64 0 @@ -477,8 +634,8 @@ define void @splat_imm_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #3 // =0x3 -; NONEON-NOSVE-NEXT: dup v0.4s, w8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI26_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI26_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x i32> undef, i32 3, i64 0 @@ -496,8 +653,8 @@ define void @splat_imm_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #4 // =0x4 -; NONEON-NOSVE-NEXT: dup v0.2d, x8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI27_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI27_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x i64> undef, i64 4, i64 0 @@ -519,8 +676,8 @@ define void @splat_imm_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: mov w8, #17664 // =0x4500 -; NONEON-NOSVE-NEXT: dup v0.8h, w8 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI28_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI28_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <16 x half> undef, half 5.0, i64 0 @@ -538,7 +695,8 @@ define void @splat_imm_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov v0.4s, #6.00000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI29_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI29_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <8 x float> undef, float 6.0, i64 0 @@ -556,7 +714,8 @@ define void @splat_imm_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: splat_imm_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: fmov v0.2d, #7.00000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI30_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI30_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret %insert = insertelement <4 x double> undef, double 7.0, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll index f055061b13bed6..a77ac7832e17cb 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll @@ -31,7 +31,8 @@ define void @store_v8i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI1_0 +; NONEON-NOSVE-NEXT: ldr d0, [x8, :lo12:.LCPI1_0] ; NONEON-NOSVE-NEXT: str d0, [x0] ; NONEON-NOSVE-NEXT: ret store <8 x i8> zeroinitializer, ptr %a @@ -47,7 +48,8 @@ define void @store_v16i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] ; NONEON-NOSVE-NEXT: str q0, [x0] ; NONEON-NOSVE-NEXT: ret store <16 x i8> zeroinitializer, ptr %a @@ -63,7 +65,8 @@ define void @store_v32i8(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <32 x i8> zeroinitializer, ptr %a @@ -96,7 +99,14 @@ define void @store_v2f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v2f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: str wzr, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI5_0 +; NONEON-NOSVE-NEXT: ldr d0, [x8, :lo12:.LCPI5_0] +; NONEON-NOSVE-NEXT: str d0, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: str w8, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret store <2 x half> zeroinitializer, ptr %a ret void @@ -111,7 +121,8 @@ define void @store_v4i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI6_0 +; NONEON-NOSVE-NEXT: ldr d0, [x8, :lo12:.LCPI6_0] ; NONEON-NOSVE-NEXT: str d0, [x0] ; NONEON-NOSVE-NEXT: ret store <4 x i16> zeroinitializer, ptr %a @@ -127,7 +138,8 @@ define void @store_v4f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI7_0 +; NONEON-NOSVE-NEXT: ldr d0, [x8, :lo12:.LCPI7_0] ; NONEON-NOSVE-NEXT: str d0, [x0] ; NONEON-NOSVE-NEXT: ret store <4 x half> zeroinitializer, ptr %a @@ -143,7 +155,8 @@ define void @store_v8i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI8_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI8_0] ; NONEON-NOSVE-NEXT: str q0, [x0] ; NONEON-NOSVE-NEXT: ret store <8 x i16> zeroinitializer, ptr %a @@ -159,7 +172,8 @@ define void @store_v8f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI9_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI9_0] ; NONEON-NOSVE-NEXT: str q0, [x0] ; NONEON-NOSVE-NEXT: ret store <8 x half> zeroinitializer, ptr %a @@ -175,7 +189,8 @@ define void @store_v16i16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI10_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <16 x i16> zeroinitializer, ptr %a @@ -191,7 +206,8 @@ define void @store_v16f16(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI11_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI11_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <16 x half> zeroinitializer, ptr %a @@ -263,7 +279,8 @@ define void @store_v8i32(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI16_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI16_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <8 x i32> zeroinitializer, ptr %a @@ -279,7 +296,8 @@ define void @store_v8f32(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI17_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI17_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <8 x float> zeroinitializer, ptr %a @@ -295,8 +313,12 @@ define void @store_v1i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v1i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str xzr, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret store <1 x i64> zeroinitializer, ptr %a ret void @@ -311,8 +333,12 @@ define void @store_v1f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v1f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi d0, #0000000000000000 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: str xzr, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: str d0, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret store <1 x double> zeroinitializer, ptr %a ret void @@ -355,7 +381,8 @@ define void @store_v4i64(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI22_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI22_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <4 x i64> zeroinitializer, ptr %a @@ -371,7 +398,8 @@ define void @store_v4f64(ptr %a) { ; ; NONEON-NOSVE-LABEL: store_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI23_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI23_0] ; NONEON-NOSVE-NEXT: stp q0, q0, [x0] ; NONEON-NOSVE-NEXT: ret store <4 x double> zeroinitializer, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll index 80c9ef87e9b915..a9f4d92b1e6b64 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll @@ -27,8 +27,12 @@ define void @subvector_v4i8(ptr %in, ptr %out) { ; ; NONEON-NOSVE-LABEL: subvector_v4i8: ; NONEON-NOSVE: // %bb.0: // %bb1 -; NONEON-NOSVE-NEXT: ldr w8, [x0] -; NONEON-NOSVE-NEXT: str w8, [x1] +; NONEON-NOSVE-NEXT: ldrh w8, [x0, #2] +; NONEON-NOSVE-NEXT: ldrb w9, [x0, #1] +; NONEON-NOSVE-NEXT: ldrb w10, [x0] +; NONEON-NOSVE-NEXT: strh w8, [x1, #2] +; NONEON-NOSVE-NEXT: strb w9, [x1, #1] +; NONEON-NOSVE-NEXT: strb w10, [x1] ; NONEON-NOSVE-NEXT: ret %a = load <4 x i8>, ptr %in br label %bb1 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll index 41b68e10e75ded..30682751037fe5 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll @@ -17,8 +17,27 @@ define void @store_trunc_v8i16i8(ptr %ap, ptr %dest) { ; NONEON-NOSVE-LABEL: store_trunc_v8i16i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #27] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i16>, ptr %ap %val = trunc <8 x i16> %a to <8 x i8> @@ -37,9 +56,15 @@ define void @store_trunc_v4i32i8(ptr %ap, ptr %dest) { ; NONEON-NOSVE-LABEL: store_trunc_v4i32i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; NONEON-NOSVE-NEXT: str s0, [x1] +; NONEON-NOSVE-NEXT: str q0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp] +; NONEON-NOSVE-NEXT: strb w8, [x1, #3] +; NONEON-NOSVE-NEXT: strb w9, [x1, #2] +; NONEON-NOSVE-NEXT: strb w11, [x1, #1] +; NONEON-NOSVE-NEXT: strb w10, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = trunc <4 x i32> %a to <4 x i8> @@ -58,8 +83,17 @@ define void @store_trunc_v4i32i16(ptr %ap, ptr %dest) { ; NONEON-NOSVE-LABEL: store_trunc_v4i32i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #30] +; NONEON-NOSVE-NEXT: strh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #26] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i32>, ptr %ap %val = trunc <4 x i32> %a to <4 x i16> @@ -78,8 +112,13 @@ define void @store_trunc_v2i64i8(ptr %ap, ptr %dest) { ; NONEON-NOSVE-LABEL: store_trunc_v2i64i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldr q0, [x0] -; NONEON-NOSVE-NEXT: xtn v0.2s, v0.2d +; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] ; NONEON-NOSVE-NEXT: str d0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <2 x i64>, ptr %ap %val = trunc <2 x i64> %a to <2 x i32> @@ -99,10 +138,15 @@ define void @store_trunc_v2i256i64(ptr %ap, ptr %dest) { ; ; NONEON-NOSVE-LABEL: store_trunc_v2i256i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr d0, [x0, #32] -; NONEON-NOSVE-NEXT: ldr d1, [x0] -; NONEON-NOSVE-NEXT: mov v1.d[1], v0.d[0] -; NONEON-NOSVE-NEXT: str q1, [x1] +; NONEON-NOSVE-NEXT: ldr x8, [x0, #32] +; NONEON-NOSVE-NEXT: ldr x9, [x0] +; NONEON-NOSVE-NEXT: stp x9, x8, [sp, #-32]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: str q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %a = load <2 x i256>, ptr %ap %val = trunc <2 x i256> %a to <2 x i64> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll index 8242b4e26d5057..bc046059f0bd59 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll @@ -24,7 +24,41 @@ define <16 x i8> @trunc_v16i16_v16i8(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: trunc_v16i16_v16i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] +; NONEON-NOSVE-NEXT: strb w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #43] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] +; NONEON-NOSVE-NEXT: strb w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strb w8, [sp, #39] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: strb w8, [sp, #38] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] +; NONEON-NOSVE-NEXT: strb w8, [sp, #37] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strb w8, [sp, #35] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: strb w8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] +; NONEON-NOSVE-NEXT: strb w8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i16>, ptr %in %b = trunc <16 x i16> %a to <16 x i8> @@ -51,13 +85,125 @@ define void @trunc_v32i16_v32i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i16_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: sub sp, sp, #208 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #112] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #128] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #144] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #160] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #16] +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w25, [sp, #28] +; NONEON-NOSVE-NEXT: ldrh w26, [sp, #30] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #64] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w29, [sp, #52] +; NONEON-NOSVE-NEXT: ldrh w27, [sp, #48] +; NONEON-NOSVE-NEXT: ldrh w28, [sp, #50] +; NONEON-NOSVE-NEXT: ldrh w23, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #54] +; NONEON-NOSVE-NEXT: ldrh w24, [sp, #26] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w21, [sp, #20] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldrh w22, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #92] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #44] +; NONEON-NOSVE-NEXT: strb w9, [sp, #91] +; NONEON-NOSVE-NEXT: add w9, w28, w28 +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #46] +; NONEON-NOSVE-NEXT: strb w8, [sp, #90] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #40] +; NONEON-NOSVE-NEXT: strb w9, [sp, #89] +; NONEON-NOSVE-NEXT: add w9, w26, w26 +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #42] +; NONEON-NOSVE-NEXT: strb w8, [sp, #88] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #36] +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #192] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w20, [sp, #18] +; NONEON-NOSVE-NEXT: strb w9, [sp, #87] +; NONEON-NOSVE-NEXT: add w9, w24, w24 +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #38] +; NONEON-NOSVE-NEXT: strb w8, [sp, #86] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #60] +; NONEON-NOSVE-NEXT: strb w9, [sp, #85] +; NONEON-NOSVE-NEXT: add w9, w22, w22 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #62] +; NONEON-NOSVE-NEXT: add w6, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #84] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: add w5, w13, w13 +; NONEON-NOSVE-NEXT: strb w9, [sp, #83] +; NONEON-NOSVE-NEXT: add w9, w20, w20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #82] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #32] +; NONEON-NOSVE-NEXT: strb w9, [sp, #81] +; NONEON-NOSVE-NEXT: add w9, w7, w7 +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #34] +; NONEON-NOSVE-NEXT: strb w8, [sp, #80] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #76] +; NONEON-NOSVE-NEXT: strb w9, [sp, #111] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #78] +; NONEON-NOSVE-NEXT: strb w8, [sp, #110] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #72] +; NONEON-NOSVE-NEXT: strb w9, [sp, #109] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #74] +; NONEON-NOSVE-NEXT: strb w8, [sp, #108] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #68] +; NONEON-NOSVE-NEXT: strb w9, [sp, #107] +; NONEON-NOSVE-NEXT: add w9, w17, w17 +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #70] +; NONEON-NOSVE-NEXT: strb w8, [sp, #106] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldrh w30, [sp, #58] +; NONEON-NOSVE-NEXT: strb w9, [sp, #105] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strb w8, [sp, #104] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strb w9, [sp, #103] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strb w8, [sp, #102] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w9, [sp, #101] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strb w8, [sp, #100] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strb w9, [sp, #99] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #98] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w5, [sp, #95] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w6, [sp, #94] +; NONEON-NOSVE-NEXT: strb w5, [sp, #93] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #97] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #176] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #96] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #160] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #80] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #144] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #128] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #112] // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #208 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i16>, ptr %in %b = trunc <32 x i16> %a to <32 x i8> @@ -97,20 +243,276 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v64i16_v64i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: sub sp, sp, #448 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #416] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: uzp1 v3.16b, v5.16b, v4.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v6.16b, v1.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b -; NONEON-NOSVE-NEXT: add v3.16b, v3.16b, v3.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: str x1, [sp, #152] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #432] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #224] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #238] +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #256] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #232] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #272] +; NONEON-NOSVE-NEXT: stp q5, q7, [sp, #160] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #230] +; NONEON-NOSVE-NEXT: add w21, w8, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #274] +; NONEON-NOSVE-NEXT: stp q6, q0, [sp, #192] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #228] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #226] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #224] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #276] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #278] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #270] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #268] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #266] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #280] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #282] +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #264] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #262] +; NONEON-NOSVE-NEXT: ldrh w1, [sp, #260] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #284] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #286] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #258] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #256] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #254] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #208] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #210] +; NONEON-NOSVE-NEXT: ldrh w5, [sp, #252] +; NONEON-NOSVE-NEXT: ldrh w6, [sp, #250] +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #248] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #212] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #214] +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #246] +; NONEON-NOSVE-NEXT: ldrh w20, [sp, #244] +; NONEON-NOSVE-NEXT: ldrh w22, [sp, #242] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #216] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #218] +; NONEON-NOSVE-NEXT: ldrh w23, [sp, #240] +; NONEON-NOSVE-NEXT: ldrh w24, [sp, #174] +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #384] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #220] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #222] +; NONEON-NOSVE-NEXT: ldrh w25, [sp, #172] +; NONEON-NOSVE-NEXT: ldrh w26, [sp, #170] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #178] +; NONEON-NOSVE-NEXT: ldrh w27, [sp, #168] +; NONEON-NOSVE-NEXT: ldrh w28, [sp, #166] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #352] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #180] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #182] +; NONEON-NOSVE-NEXT: ldrh w29, [sp, #164] +; NONEON-NOSVE-NEXT: ldrh w30, [sp, #162] +; NONEON-NOSVE-NEXT: strb w21, [sp, #335] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #184] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #186] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #188] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #190] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #192] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #194] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #196] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #198] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #200] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #202] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #204] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #206] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #236] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #234] +; NONEON-NOSVE-NEXT: strb w9, [sp, #334] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #333] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #332] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strb w8, [sp, #331] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #330] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strb w8, [sp, #329] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strb w8, [sp, #328] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strb w8, [sp, #327] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #326] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strb w8, [sp, #325] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: strb w8, [sp, #324] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strb w8, [sp, #323] +; NONEON-NOSVE-NEXT: add w8, w1, w1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #322] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #321] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strb w8, [sp, #320] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #319] +; NONEON-NOSVE-NEXT: add w8, w5, w5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #318] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: strb w8, [sp, #317] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #316] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: strb w8, [sp, #315] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #314] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #313] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: strb w8, [sp, #312] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #311] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: strb w8, [sp, #310] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #309] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: strb w8, [sp, #308] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: strb w8, [sp, #307] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: strb w8, [sp, #306] +; NONEON-NOSVE-NEXT: add w8, w30, w30 +; NONEON-NOSVE-NEXT: strb w8, [sp, #305] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #432] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #416] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #304] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #384] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #303] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #352] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #302] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #301] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #299] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #298] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #297] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #295] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #294] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #293] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #291] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #290] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #289] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #288] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q3, [sp, #288] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #351] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #350] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #349] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #348] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #347] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #346] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #345] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #344] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #343] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #342] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #341] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #340] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #339] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #140] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #338] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #337] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #148] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #336] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #152] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q2, q0, [sp, #320] +; NONEON-NOSVE-NEXT: stp q3, q2, [x8] +; NONEON-NOSVE-NEXT: stp q0, q1, [x8, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #448 ; NONEON-NOSVE-NEXT: ret %a = load <64 x i16>, ptr %in %b = trunc <64 x i16> %a to <64 x i8> @@ -172,34 +574,598 @@ define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v128i16_v128i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] -; NONEON-NOSVE-NEXT: uzp1 v4.16b, v5.16b, v4.16b -; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v6.16b, v7.16b, v6.16b +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #800 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: str x1, [sp, #408] // 8-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v16.16b, v1.16b -; NONEON-NOSVE-NEXT: uzp1 v5.16b, v17.16b, v5.16b -; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.16b, v3.16b, v2.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v4.16b, v4.16b, v4.16b -; NONEON-NOSVE-NEXT: uzp1 v7.16b, v18.16b, v7.16b -; NONEON-NOSVE-NEXT: add v3.16b, v6.16b, v6.16b -; NONEON-NOSVE-NEXT: uzp1 v6.16b, v17.16b, v16.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] -; NONEON-NOSVE-NEXT: add v0.16b, v5.16b, v5.16b -; NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b -; NONEON-NOSVE-NEXT: add v4.16b, v7.16b, v7.16b -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] -; NONEON-NOSVE-NEXT: add v1.16b, v6.16b, v6.16b -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] -; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #160] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #128] +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #224] +; NONEON-NOSVE-NEXT: str q0, [sp, #592] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #606] +; NONEON-NOSVE-NEXT: str q19, [sp, #496] +; NONEON-NOSVE-NEXT: ldrh w10, [sp, #600] +; NONEON-NOSVE-NEXT: stp q18, q20, [sp, #512] +; NONEON-NOSVE-NEXT: ldrh w11, [sp, #598] +; NONEON-NOSVE-NEXT: ldrh w12, [sp, #596] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp q17, q23, [sp, #432] +; NONEON-NOSVE-NEXT: ldrh w13, [sp, #594] +; NONEON-NOSVE-NEXT: str w8, [sp, #64] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #432] +; NONEON-NOSVE-NEXT: ldrh w14, [sp, #592] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #464] +; NONEON-NOSVE-NEXT: ldr w30, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: str w8, [sp, #404] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #434] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #560] +; NONEON-NOSVE-NEXT: str w8, [sp, #400] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #436] +; NONEON-NOSVE-NEXT: str q5, [sp, #544] +; NONEON-NOSVE-NEXT: str w8, [sp, #396] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #438] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #608] +; NONEON-NOSVE-NEXT: str w8, [sp, #392] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #440] +; NONEON-NOSVE-NEXT: ldrh w15, [sp, #638] +; NONEON-NOSVE-NEXT: stp q7, q21, [sp, #640] +; NONEON-NOSVE-NEXT: ldrh w16, [sp, #636] +; NONEON-NOSVE-NEXT: ldrh w17, [sp, #634] +; NONEON-NOSVE-NEXT: str w8, [sp, #388] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #442] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #666] +; NONEON-NOSVE-NEXT: str q3, [sp, #416] +; NONEON-NOSVE-NEXT: ldrh w18, [sp, #632] +; NONEON-NOSVE-NEXT: ldrh w0, [sp, #630] +; NONEON-NOSVE-NEXT: str w8, [sp, #384] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #444] +; NONEON-NOSVE-NEXT: ldrh w1, [sp, #628] +; NONEON-NOSVE-NEXT: ldrh w2, [sp, #626] +; NONEON-NOSVE-NEXT: ldrh w3, [sp, #624] +; NONEON-NOSVE-NEXT: ldrh w4, [sp, #622] +; NONEON-NOSVE-NEXT: str w8, [sp, #380] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #446] +; NONEON-NOSVE-NEXT: ldrh w5, [sp, #620] +; NONEON-NOSVE-NEXT: ldrh w6, [sp, #618] +; NONEON-NOSVE-NEXT: ldrh w7, [sp, #616] +; NONEON-NOSVE-NEXT: ldrh w19, [sp, #614] +; NONEON-NOSVE-NEXT: str w8, [sp, #376] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #480] +; NONEON-NOSVE-NEXT: ldrh w20, [sp, #612] +; NONEON-NOSVE-NEXT: ldrh w21, [sp, #610] +; NONEON-NOSVE-NEXT: ldrh w22, [sp, #608] +; NONEON-NOSVE-NEXT: ldrh w23, [sp, #430] +; NONEON-NOSVE-NEXT: str w8, [sp, #372] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #482] +; NONEON-NOSVE-NEXT: ldrh w24, [sp, #428] +; NONEON-NOSVE-NEXT: ldrh w25, [sp, #426] +; NONEON-NOSVE-NEXT: ldrh w26, [sp, #424] +; NONEON-NOSVE-NEXT: ldrh w27, [sp, #422] +; NONEON-NOSVE-NEXT: str w8, [sp, #368] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #484] +; NONEON-NOSVE-NEXT: ldrh w28, [sp, #420] +; NONEON-NOSVE-NEXT: ldrh w29, [sp, #418] +; NONEON-NOSVE-NEXT: strb w30, [sp, #767] +; NONEON-NOSVE-NEXT: str w8, [sp, #364] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #486] +; NONEON-NOSVE-NEXT: str w8, [sp, #360] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #488] +; NONEON-NOSVE-NEXT: str w8, [sp, #356] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #490] +; NONEON-NOSVE-NEXT: str w8, [sp, #352] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #492] +; NONEON-NOSVE-NEXT: str w8, [sp, #348] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #494] +; NONEON-NOSVE-NEXT: str w8, [sp, #344] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #448] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #450] +; NONEON-NOSVE-NEXT: str w8, [sp, #336] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #452] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #454] +; NONEON-NOSVE-NEXT: str w8, [sp, #328] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #456] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #458] +; NONEON-NOSVE-NEXT: str w8, [sp, #320] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #460] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #462] +; NONEON-NOSVE-NEXT: str w8, [sp, #312] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #464] +; NONEON-NOSVE-NEXT: str w8, [sp, #308] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #466] +; NONEON-NOSVE-NEXT: str w8, [sp, #304] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #468] +; NONEON-NOSVE-NEXT: str w8, [sp, #300] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #470] +; NONEON-NOSVE-NEXT: str w8, [sp, #296] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #472] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #474] +; NONEON-NOSVE-NEXT: str w8, [sp, #288] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #476] +; NONEON-NOSVE-NEXT: str w8, [sp, #284] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #478] +; NONEON-NOSVE-NEXT: str w8, [sp, #280] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #656] +; NONEON-NOSVE-NEXT: str w8, [sp, #276] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #658] +; NONEON-NOSVE-NEXT: str w8, [sp, #272] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #660] +; NONEON-NOSVE-NEXT: str w8, [sp, #268] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #662] +; NONEON-NOSVE-NEXT: str w8, [sp, #264] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #664] +; NONEON-NOSVE-NEXT: str w8, [sp, #260] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #668] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #252] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #670] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #528] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #244] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #530] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #532] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #236] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #534] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #536] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #228] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #538] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #540] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #220] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #542] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #496] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #212] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #498] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #500] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #204] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #502] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #504] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #196] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #506] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #508] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #188] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #510] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #512] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #180] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #514] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #516] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #172] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #518] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #520] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #164] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #522] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #524] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #156] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #526] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #640] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #148] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #642] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #644] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #140] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #646] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #648] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #132] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #650] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #652] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #124] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #654] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #576] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #116] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #578] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #580] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #108] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #582] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #584] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #100] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #586] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #588] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #92] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #590] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #544] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #84] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #546] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #548] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #76] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #550] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #552] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #68] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #554] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #556] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #558] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #560] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #562] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #564] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #566] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #568] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #570] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #572] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #574] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #416] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #602] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #604] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #765] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #764] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strb w8, [sp, #763] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #762] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strb w8, [sp, #761] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strb w8, [sp, #760] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strb w8, [sp, #759] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #758] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strb w8, [sp, #757] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: strb w8, [sp, #756] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strb w8, [sp, #755] +; NONEON-NOSVE-NEXT: add w8, w1, w1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #754] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #753] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strb w8, [sp, #752] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #751] +; NONEON-NOSVE-NEXT: add w8, w5, w5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #750] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: strb w8, [sp, #749] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #748] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: strb w8, [sp, #747] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: strb w8, [sp, #746] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: strb w8, [sp, #745] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: strb w8, [sp, #744] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: strb w8, [sp, #743] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #742] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: strb w8, [sp, #741] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #740] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: strb w8, [sp, #739] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: strb w8, [sp, #738] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: strb w8, [sp, #737] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #766] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #736] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #736] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #735] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #734] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #733] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #732] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #731] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #730] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #729] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #728] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #727] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #726] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #725] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #724] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #723] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #722] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #721] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #720] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #783] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #782] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #781] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #780] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #779] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #778] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #777] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #776] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #775] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #774] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #773] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #772] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #771] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #140] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #770] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #769] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #148] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #768] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #152] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #719] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #156] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #718] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #717] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #164] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #716] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #168] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #715] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #172] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #714] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #713] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #180] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #712] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #711] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #188] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #710] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #709] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #196] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #708] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #200] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #707] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #204] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #706] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #208] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #705] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #212] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #704] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #216] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q6, q3, [sp, #704] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #799] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #220] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #798] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #224] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #797] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #228] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #796] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #232] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #795] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #236] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #794] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #240] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #793] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #244] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #792] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #248] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #791] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #252] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #790] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #256] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #789] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #260] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #788] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #264] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #787] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #268] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #786] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #272] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #785] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #276] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #784] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #280] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q4, q7, [sp, #768] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #687] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #284] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #686] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #288] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #685] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #292] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #684] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #296] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #683] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #300] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #682] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #304] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #681] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #308] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #680] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #312] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #679] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #316] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #678] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #320] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #677] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #324] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #676] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #328] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #675] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #332] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #674] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #336] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #673] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #340] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #672] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #344] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #703] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #348] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #702] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #352] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #701] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #356] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #700] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #360] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #699] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #364] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #698] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #368] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #697] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #372] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #696] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #376] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #695] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #380] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #694] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #384] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #693] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #388] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #692] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #392] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #691] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #396] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #690] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #400] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #689] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #404] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #688] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #408] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #672] +; NONEON-NOSVE-NEXT: stp q1, q0, [x8] +; NONEON-NOSVE-NEXT: stp q4, q3, [x8, #32] +; NONEON-NOSVE-NEXT: stp q7, q6, [x8, #64] +; NONEON-NOSVE-NEXT: stp q2, q5, [x8, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #800 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %a = load <128 x i16>, ptr %in %b = trunc <128 x i16> %a to <128 x i8> @@ -227,8 +1193,21 @@ define <8 x i8> @trunc_v8i32_v8i8(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: trunc_v8i32_v8i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: strb w9, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: strb w9, [sp, #45] +; NONEON-NOSVE-NEXT: strb w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w9, [sp, #43] +; NONEON-NOSVE-NEXT: strb w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: strb w9, [sp, #41] +; NONEON-NOSVE-NEXT: strb w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = trunc <8 x i32> %a to <8 x i8> @@ -256,11 +1235,38 @@ define <16 x i8> @trunc_v16i32_v16i8(ptr %in) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v16i32_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: strb w9, [sp, #77] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: strb w9, [sp, #75] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] +; NONEON-NOSVE-NEXT: strb w9, [sp, #73] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: strb w9, [sp, #71] +; NONEON-NOSVE-NEXT: strb w8, [sp, #70] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] +; NONEON-NOSVE-NEXT: strb w9, [sp, #69] +; NONEON-NOSVE-NEXT: strb w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strb w9, [sp, #67] +; NONEON-NOSVE-NEXT: strb w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: strb w9, [sp, #65] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i32>, ptr %in %b = trunc <16 x i32> %a to <16 x i8> @@ -302,19 +1308,113 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i32_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: sub sp, sp, #272 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #192] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v7.8h, v6.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v3.16b, v1.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #224] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #80] +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #112] +; NONEON-NOSVE-NEXT: stp q5, q7, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldp w27, w28, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w25, w26, [sp, #104] +; NONEON-NOSVE-NEXT: add w6, w8, w8 +; NONEON-NOSVE-NEXT: add w5, w9, w9 +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w10, w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp w23, w24, [sp, #96] +; NONEON-NOSVE-NEXT: ldp w21, w22, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #120] +; NONEON-NOSVE-NEXT: stp q6, q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w19, w20, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: strb w8, [sp, #155] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: strb w9, [sp, #154] +; NONEON-NOSVE-NEXT: add w9, w27, w27 +; NONEON-NOSVE-NEXT: strb w8, [sp, #153] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strb w9, [sp, #152] +; NONEON-NOSVE-NEXT: add w9, w25, w25 +; NONEON-NOSVE-NEXT: ldp w4, w7, [sp, #56] +; NONEON-NOSVE-NEXT: strb w8, [sp, #151] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strb w9, [sp, #150] +; NONEON-NOSVE-NEXT: add w9, w23, w23 +; NONEON-NOSVE-NEXT: ldp w2, w3, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #149] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: strb w9, [sp, #148] +; NONEON-NOSVE-NEXT: add w9, w21, w21 +; NONEON-NOSVE-NEXT: ldp w18, w0, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #147] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: strb w9, [sp, #146] +; NONEON-NOSVE-NEXT: add w9, w19, w19 +; NONEON-NOSVE-NEXT: ldp w16, w17, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #145] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strb w9, [sp, #144] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #72] +; NONEON-NOSVE-NEXT: strb w8, [sp, #175] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strb w9, [sp, #174] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #64] +; NONEON-NOSVE-NEXT: strb w8, [sp, #173] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strb w9, [sp, #172] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #136] +; NONEON-NOSVE-NEXT: strb w8, [sp, #171] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #170] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #169] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strb w9, [sp, #168] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: strb w8, [sp, #167] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strb w9, [sp, #166] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: ldp w29, w30, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #165] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strb w9, [sp, #164] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #163] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #162] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w5, [sp, #159] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w6, [sp, #158] +; NONEON-NOSVE-NEXT: add w6, w29, w29 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w5, [sp, #157] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w6, [sp, #156] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #240] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #161] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #160] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #144] +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #192] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #272 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i32>, ptr %in %b = trunc <32 x i32> %a to <32 x i8> @@ -383,32 +1483,273 @@ define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v64i32_v64i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #128] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #160] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #224] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] -; NONEON-NOSVE-NEXT: uzp1 v4.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v6.8h, v7.8h, v6.8h -; NONEON-NOSVE-NEXT: ldp q16, q7, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v1.8h -; NONEON-NOSVE-NEXT: uzp1 v5.8h, v17.8h, v5.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; NONEON-NOSVE-NEXT: uzp1 v7.8h, v16.8h, v7.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v19.8h, v18.8h -; NONEON-NOSVE-NEXT: uzp1 v2.16b, v4.16b, v6.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v1.16b, v7.16b -; NONEON-NOSVE-NEXT: uzp1 v3.16b, v5.16b, v3.16b -; NONEON-NOSVE-NEXT: add v2.16b, v2.16b, v2.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add v3.16b, v3.16b, v3.16b -; NONEON-NOSVE-NEXT: stp q1, q3, [x1] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #480 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] +; NONEON-NOSVE-NEXT: str x1, [sp, #152] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #128] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #160] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #288] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #316] +; NONEON-NOSVE-NEXT: str q18, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #304] +; NONEON-NOSVE-NEXT: stp q21, q19, [sp, #176] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #292] +; NONEON-NOSVE-NEXT: add w20, w8, w8 +; NONEON-NOSVE-NEXT: stp q20, q23, [sp, #224] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #288] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #312] +; NONEON-NOSVE-NEXT: stp q3, q17, [sp, #384] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #400] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #404] +; NONEON-NOSVE-NEXT: str q7, [sp, #160] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #320] +; NONEON-NOSVE-NEXT: ldr w18, [sp, #396] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #392] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #408] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #412] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #332] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #328] +; NONEON-NOSVE-NEXT: ldr w16, [sp, #324] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #272] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #276] +; NONEON-NOSVE-NEXT: ldr w17, [sp, #320] +; NONEON-NOSVE-NEXT: ldr w1, [sp, #388] +; NONEON-NOSVE-NEXT: ldr w2, [sp, #384] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #280] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #284] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #348] +; NONEON-NOSVE-NEXT: ldr w4, [sp, #344] +; NONEON-NOSVE-NEXT: ldr w5, [sp, #340] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w6, [sp, #336] +; NONEON-NOSVE-NEXT: stp q6, q5, [sp, #352] +; NONEON-NOSVE-NEXT: ldr w7, [sp, #380] +; NONEON-NOSVE-NEXT: ldr w19, [sp, #376] +; NONEON-NOSVE-NEXT: ldr w21, [sp, #372] +; NONEON-NOSVE-NEXT: ldr w23, [sp, #368] +; NONEON-NOSVE-NEXT: ldr w24, [sp, #364] +; NONEON-NOSVE-NEXT: ldr w25, [sp, #360] +; NONEON-NOSVE-NEXT: ldr w26, [sp, #356] +; NONEON-NOSVE-NEXT: ldr w27, [sp, #352] +; NONEON-NOSVE-NEXT: strb w20, [sp, #463] +; NONEON-NOSVE-NEXT: add w20, w22, w22 +; NONEON-NOSVE-NEXT: strb w20, [sp, #462] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #240] +; NONEON-NOSVE-NEXT: ldp w29, w28, [sp, #168] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #248] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #260] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #264] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #268] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #176] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #184] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #224] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #232] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #192] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #200] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #208] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #216] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #300] +; NONEON-NOSVE-NEXT: ldp w8, w30, [sp, #160] +; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w8, [sp, #308] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #461] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #460] +; NONEON-NOSVE-NEXT: add w8, w9, w9 +; NONEON-NOSVE-NEXT: strb w8, [sp, #459] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strb w8, [sp, #458] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #457] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strb w8, [sp, #456] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strb w8, [sp, #455] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strb w8, [sp, #454] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #453] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strb w8, [sp, #452] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: strb w8, [sp, #451] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strb w8, [sp, #450] +; NONEON-NOSVE-NEXT: add w8, w1, w1 +; NONEON-NOSVE-NEXT: strb w8, [sp, #449] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: strb w8, [sp, #448] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strb w8, [sp, #447] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: strb w8, [sp, #446] +; NONEON-NOSVE-NEXT: add w8, w5, w5 +; NONEON-NOSVE-NEXT: strb w8, [sp, #445] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: strb w8, [sp, #444] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strb w8, [sp, #443] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: strb w8, [sp, #442] +; NONEON-NOSVE-NEXT: add w8, w21, w21 +; NONEON-NOSVE-NEXT: strb w8, [sp, #441] +; NONEON-NOSVE-NEXT: add w8, w23, w23 +; NONEON-NOSVE-NEXT: strb w8, [sp, #440] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strb w8, [sp, #439] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: strb w8, [sp, #438] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strb w8, [sp, #437] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: strb w8, [sp, #436] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: strb w8, [sp, #435] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: strb w8, [sp, #434] +; NONEON-NOSVE-NEXT: add w8, w30, w30 +; NONEON-NOSVE-NEXT: strb w8, [sp, #433] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #432] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #431] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #430] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #429] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #428] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #427] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #426] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #425] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #424] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #423] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #422] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #421] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #420] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #419] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #418] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #417] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #416] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q3, [sp, #416] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #479] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #478] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #477] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #476] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #475] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #474] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #473] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #472] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #471] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #470] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #469] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #468] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #467] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #140] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #466] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #144] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #465] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #148] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w8, [sp, #464] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #152] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q2, q0, [sp, #448] +; NONEON-NOSVE-NEXT: stp q3, q2, [x8] +; NONEON-NOSVE-NEXT: stp q0, q1, [x8, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #480 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i8> @@ -435,7 +1776,21 @@ define <8 x i16> @trunc_v8i32_v8i16(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: trunc_v8i32_v8i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w9, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #38] +; NONEON-NOSVE-NEXT: strh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #34] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i32>, ptr %in %b = trunc <8 x i32> %a to <8 x i16> @@ -462,13 +1817,54 @@ define void @trunc_v16i32_v16i16(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v16i32_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: stp q3, q1, [sp] +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldp w2, w3, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w4, w5, [sp, #8] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldp w18, w0, [sp] +; NONEON-NOSVE-NEXT: ldp w16, w17, [sp, #24] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strh w9, [sp, #76] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #74] +; NONEON-NOSVE-NEXT: add w8, w5, w5 +; NONEON-NOSVE-NEXT: strh w9, [sp, #72] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #70] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strh w9, [sp, #68] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strh w9, [sp, #64] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #94] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strh w9, [sp, #92] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #90] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strh w9, [sp, #88] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #86] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strh w9, [sp, #84] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #82] +; NONEON-NOSVE-NEXT: strh w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i32>, ptr %in %b = trunc <16 x i32> %a to <16 x i16> @@ -508,20 +1904,115 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i32_v32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: sub sp, sp, #304 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #224] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v6.8h, v1.8h -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h -; NONEON-NOSVE-NEXT: add v3.8h, v3.8h, v3.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #240] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #256] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #272] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #80] +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #112] +; NONEON-NOSVE-NEXT: stp q5, q7, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: ldp w27, w28, [sp, #112] +; NONEON-NOSVE-NEXT: ldp w25, w26, [sp, #104] +; NONEON-NOSVE-NEXT: add w6, w8, w8 +; NONEON-NOSVE-NEXT: add w5, w9, w9 +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #288] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w10, w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldp w23, w24, [sp, #96] +; NONEON-NOSVE-NEXT: ldp w21, w22, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #120] +; NONEON-NOSVE-NEXT: stp q6, q0, [sp, #48] +; NONEON-NOSVE-NEXT: ldp w19, w20, [sp, #16] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #208] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: strh w8, [sp, #182] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: strh w9, [sp, #180] +; NONEON-NOSVE-NEXT: add w9, w27, w27 +; NONEON-NOSVE-NEXT: strh w8, [sp, #178] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strh w9, [sp, #176] +; NONEON-NOSVE-NEXT: add w9, w25, w25 +; NONEON-NOSVE-NEXT: ldp w4, w7, [sp, #56] +; NONEON-NOSVE-NEXT: strh w8, [sp, #174] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strh w9, [sp, #172] +; NONEON-NOSVE-NEXT: add w9, w23, w23 +; NONEON-NOSVE-NEXT: ldp w2, w3, [sp, #48] +; NONEON-NOSVE-NEXT: strh w8, [sp, #170] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: strh w9, [sp, #168] +; NONEON-NOSVE-NEXT: add w9, w21, w21 +; NONEON-NOSVE-NEXT: ldp w18, w0, [sp, #40] +; NONEON-NOSVE-NEXT: strh w8, [sp, #166] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: strh w9, [sp, #164] +; NONEON-NOSVE-NEXT: add w9, w19, w19 +; NONEON-NOSVE-NEXT: ldp w16, w17, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #162] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strh w9, [sp, #160] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldp w14, w15, [sp, #72] +; NONEON-NOSVE-NEXT: strh w8, [sp, #158] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strh w9, [sp, #156] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: ldp w12, w13, [sp, #64] +; NONEON-NOSVE-NEXT: strh w8, [sp, #154] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strh w9, [sp, #152] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: ldp w10, w11, [sp, #136] +; NONEON-NOSVE-NEXT: strh w8, [sp, #150] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strh w9, [sp, #148] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #146] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strh w9, [sp, #144] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #206] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strh w9, [sp, #204] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: ldp w29, w30, [sp, #80] +; NONEON-NOSVE-NEXT: strh w8, [sp, #202] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strh w9, [sp, #200] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #198] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w9, [sp, #196] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w5, [sp, #190] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w6, [sp, #188] +; NONEON-NOSVE-NEXT: add w6, w29, w29 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w5, [sp, #186] +; NONEON-NOSVE-NEXT: ldp q1, q3, [sp, #144] +; NONEON-NOSVE-NEXT: strh w6, [sp, #184] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #288] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #194] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #272] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w9, [sp, #192] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #256] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q2, q0, [sp, #176] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #240] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #224] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #208] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q3, q2, [x1] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #304 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i32>, ptr %in %b = trunc <32 x i32> %a to <32 x i16> @@ -583,34 +2074,276 @@ define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v64i32_v64i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] -; NONEON-NOSVE-NEXT: uzp1 v4.8h, v5.8h, v4.8h -; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v6.8h, v7.8h, v6.8h +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #-96]! // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #16] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #32] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #48] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: sub sp, sp, #528 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: mov x5, x1 +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #224] ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v16.8h, v1.8h -; NONEON-NOSVE-NEXT: uzp1 v5.8h, v17.8h, v5.8h -; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v4.8h, v4.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v7.8h, v18.8h, v7.8h -; NONEON-NOSVE-NEXT: add v3.8h, v6.8h, v6.8h -; NONEON-NOSVE-NEXT: uzp1 v6.8h, v17.8h, v16.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] -; NONEON-NOSVE-NEXT: add v0.8h, v5.8h, v5.8h -; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h -; NONEON-NOSVE-NEXT: add v4.8h, v7.8h, v7.8h -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] -; NONEON-NOSVE-NEXT: add v1.8h, v6.8h, v6.8h -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] -; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #160] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #128] +; NONEON-NOSVE-NEXT: str q0, [sp, #320] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #332] +; NONEON-NOSVE-NEXT: stp q17, q23, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #320] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #192] +; NONEON-NOSVE-NEXT: ldr w23, [sp, #328] +; NONEON-NOSVE-NEXT: add w21, w8, w8 +; NONEON-NOSVE-NEXT: stp q18, q20, [sp, #240] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #160] +; NONEON-NOSVE-NEXT: stp q7, q21, [sp, #368] +; NONEON-NOSVE-NEXT: str q19, [sp, #224] +; NONEON-NOSVE-NEXT: ldr w29, [sp, #380] +; NONEON-NOSVE-NEXT: ldr w30, [sp, #376] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #168] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #288] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #336] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #300] +; NONEON-NOSVE-NEXT: ldr w4, [sp, #296] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w11, [sp, #360] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #356] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #352] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #348] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #344] +; NONEON-NOSVE-NEXT: str q3, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w16, [sp, #340] +; NONEON-NOSVE-NEXT: ldr w17, [sp, #336] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #120] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w6, [sp, #292] +; NONEON-NOSVE-NEXT: ldr w7, [sp, #288] +; NONEON-NOSVE-NEXT: str q5, [sp, #272] +; NONEON-NOSVE-NEXT: ldr w25, [sp, #316] +; NONEON-NOSVE-NEXT: ldr w26, [sp, #312] +; NONEON-NOSVE-NEXT: ldr w19, [sp, #284] +; NONEON-NOSVE-NEXT: ldr w20, [sp, #280] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #276] +; NONEON-NOSVE-NEXT: ldr w24, [sp, #272] +; NONEON-NOSVE-NEXT: ldr w27, [sp, #308] +; NONEON-NOSVE-NEXT: ldr w28, [sp, #304] +; NONEON-NOSVE-NEXT: strh w21, [sp, #494] +; NONEON-NOSVE-NEXT: add w21, w23, w23 +; NONEON-NOSVE-NEXT: strh w21, [sp, #492] +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #216] +; NONEON-NOSVE-NEXT: ldp w0, w18, [sp, #152] +; NONEON-NOSVE-NEXT: ldp w2, w1, [sp, #144] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #112] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #176] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #184] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #96] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #192] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #200] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #384] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #388] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #392] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #396] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #260] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #56] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #264] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #268] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #48] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #224] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #232] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #240] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp, #248] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w9, [sp, #368] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #372] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w8, [sp, #324] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #364] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #490] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #488] +; NONEON-NOSVE-NEXT: add w8, w9, w9 +; NONEON-NOSVE-NEXT: strh w8, [sp, #486] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #484] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #482] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #480] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #478] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #476] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #474] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strh w8, [sp, #472] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: strh w8, [sp, #470] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #468] +; NONEON-NOSVE-NEXT: add w8, w1, w1 +; NONEON-NOSVE-NEXT: strh w8, [sp, #466] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: strh w8, [sp, #464] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #462] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: strh w8, [sp, #460] +; NONEON-NOSVE-NEXT: add w8, w6, w6 +; NONEON-NOSVE-NEXT: strh w8, [sp, #458] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strh w8, [sp, #456] +; NONEON-NOSVE-NEXT: add w8, w19, w19 +; NONEON-NOSVE-NEXT: strh w8, [sp, #454] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: strh w8, [sp, #452] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: strh w8, [sp, #450] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: strh w8, [sp, #448] +; NONEON-NOSVE-NEXT: add w8, w25, w25 +; NONEON-NOSVE-NEXT: strh w8, [sp, #510] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: strh w8, [sp, #508] +; NONEON-NOSVE-NEXT: add w8, w27, w27 +; NONEON-NOSVE-NEXT: strh w8, [sp, #506] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: strh w8, [sp, #504] +; NONEON-NOSVE-NEXT: add w8, w29, w29 +; NONEON-NOSVE-NEXT: strh w8, [sp, #502] +; NONEON-NOSVE-NEXT: add w8, w30, w30 +; NONEON-NOSVE-NEXT: strh w8, [sp, #500] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #464] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #498] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #496] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #446] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #444] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #442] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #28] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #440] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #438] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #36] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #436] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #434] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #432] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #48] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q6, q3, [sp, #432] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #526] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #52] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #524] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #522] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #60] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #520] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #518] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #68] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #516] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #72] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #514] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #76] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #512] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #80] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q4, q7, [sp, #496] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #414] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #84] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #412] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #88] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #410] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #92] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #408] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #96] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #406] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #100] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #404] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #104] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #402] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #108] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #400] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #112] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #430] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #116] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #428] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #120] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #426] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #124] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #424] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #128] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #422] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #132] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #420] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #136] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #418] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #140] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w8, [sp, #416] +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #400] +; NONEON-NOSVE-NEXT: stp q1, q0, [x5] +; NONEON-NOSVE-NEXT: stp q4, q3, [x5, #32] +; NONEON-NOSVE-NEXT: stp q7, q6, [x5, #64] +; NONEON-NOSVE-NEXT: stp q2, q5, [x5, #96] +; NONEON-NOSVE-NEXT: add sp, sp, #528 +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #64] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #48] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #32] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #16] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp], #96 // 16-byte Folded Reload ; NONEON-NOSVE-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i16> @@ -639,8 +2372,15 @@ define <4 x i8> @trunc_v4i64_v4i8(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i8: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: strh w10, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i8> @@ -669,12 +2409,27 @@ define <8 x i8> @trunc_v8i64_v8i8(ptr %in) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #48] +; NONEON-NOSVE-NEXT: strb w9, [sp, #79] +; NONEON-NOSVE-NEXT: strb w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: strb w10, [sp, #77] +; NONEON-NOSVE-NEXT: strb w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w11, [sp, #75] +; NONEON-NOSVE-NEXT: strb w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #72] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr %in %b = trunc <8 x i64> %a to <8 x i8> @@ -717,17 +2472,47 @@ define <16 x i8> @trunc_v16i64_v16i8(ptr %in) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp1 v3.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v4.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; NONEON-NOSVE-NEXT: sub sp, sp, #144 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #96] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: ldp q4, q5, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q6, q7, [x0, #64] +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: str q3, [sp, #80] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: stp q7, q5, [sp, #48] +; NONEON-NOSVE-NEXT: strb w8, [sp, #142] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #16] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #96] +; NONEON-NOSVE-NEXT: strb w9, [sp, #143] +; NONEON-NOSVE-NEXT: strb w8, [sp, #140] +; NONEON-NOSVE-NEXT: ldp x8, x11, [sp, #48] +; NONEON-NOSVE-NEXT: strb w10, [sp, #141] +; NONEON-NOSVE-NEXT: strb w8, [sp, #138] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #120] +; NONEON-NOSVE-NEXT: strb w11, [sp, #139] +; NONEON-NOSVE-NEXT: strb w8, [sp, #137] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #112] +; NONEON-NOSVE-NEXT: strb w8, [sp, #136] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #72] +; NONEON-NOSVE-NEXT: strb w8, [sp, #135] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #64] +; NONEON-NOSVE-NEXT: strb w8, [sp, #134] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #104] +; NONEON-NOSVE-NEXT: strb w8, [sp, #133] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #96] +; NONEON-NOSVE-NEXT: strb w8, [sp, #132] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #88] +; NONEON-NOSVE-NEXT: strb w8, [sp, #131] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #80] +; NONEON-NOSVE-NEXT: strb w8, [sp, #130] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: strb w8, [sp, #129] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: strb w8, [sp, #128] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #128] +; NONEON-NOSVE-NEXT: add sp, sp, #144 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i8> @@ -798,31 +2583,139 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #224] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #128] -; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #160] -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp1 v16.4s, v17.4s, v16.4s -; NONEON-NOSVE-NEXT: uzp1 v5.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v7.4s, v19.4s, v18.4s -; NONEON-NOSVE-NEXT: uzp1 v6.4s, v21.4s, v20.4s -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v4.8h, v16.8h -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v2.8h, v7.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v6.8h, v5.8h -; NONEON-NOSVE-NEXT: uzp1 v0.16b, v1.16b, v0.16b -; NONEON-NOSVE-NEXT: uzp1 v1.16b, v2.16b, v3.16b -; NONEON-NOSVE-NEXT: add v0.16b, v0.16b, v0.16b -; NONEON-NOSVE-NEXT: add v1.16b, v1.16b, v1.16b -; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: sub sp, sp, #416 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #336] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #64] +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #352] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #32] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #384] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #128] +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #320] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #224] +; NONEON-NOSVE-NEXT: str x1, [sp, #24] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #160] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #176] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #184] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #192] +; NONEON-NOSVE-NEXT: stp q21, q19, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w25, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w26, [sp, #216] +; NONEON-NOSVE-NEXT: add w5, w9, w9 +; NONEON-NOSVE-NEXT: add w6, w8, w8 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #192] +; NONEON-NOSVE-NEXT: stp q20, q23, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w2, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w16, [sp, #48] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldr w18, [sp, #96] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #128] +; NONEON-NOSVE-NEXT: stp q6, q5, [sp, #224] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #128] +; NONEON-NOSVE-NEXT: stp q3, q17, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w23, [sp, #240] +; NONEON-NOSVE-NEXT: ldr w21, [sp, #224] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #272] +; NONEON-NOSVE-NEXT: ldr w27, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w28, [sp, #264] +; NONEON-NOSVE-NEXT: strb w9, [sp, #298] +; NONEON-NOSVE-NEXT: ldr w24, [sp, #248] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #232] +; NONEON-NOSVE-NEXT: add w9, w27, w27 +; NONEON-NOSVE-NEXT: str w8, [sp, #20] // 4-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w8, [sp, #200] +; NONEON-NOSVE-NEXT: str q7, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #104] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #112] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strb w9, [sp, #296] +; NONEON-NOSVE-NEXT: add w9, w25, w25 +; NONEON-NOSVE-NEXT: str q18, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w19, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w20, [sp, #40] +; NONEON-NOSVE-NEXT: strb w8, [sp, #299] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: ldr w4, [sp, #80] +; NONEON-NOSVE-NEXT: strb w9, [sp, #294] +; NONEON-NOSVE-NEXT: add w9, w23, w23 +; NONEON-NOSVE-NEXT: ldr w7, [sp, #88] +; NONEON-NOSVE-NEXT: strb w8, [sp, #297] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #56] +; NONEON-NOSVE-NEXT: strb w9, [sp, #292] +; NONEON-NOSVE-NEXT: add w9, w21, w21 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #144] +; NONEON-NOSVE-NEXT: strb w8, [sp, #295] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: ldr w15, [sp, #136] +; NONEON-NOSVE-NEXT: strb w9, [sp, #290] +; NONEON-NOSVE-NEXT: add w9, w19, w19 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #120] +; NONEON-NOSVE-NEXT: strb w8, [sp, #293] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #152] +; NONEON-NOSVE-NEXT: strb w9, [sp, #288] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldr w1, [sp, #280] +; NONEON-NOSVE-NEXT: strb w8, [sp, #291] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: ldr w29, [sp, #160] +; NONEON-NOSVE-NEXT: strb w9, [sp, #318] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: ldr w30, [sp, #168] +; NONEON-NOSVE-NEXT: strb w8, [sp, #289] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: strb w9, [sp, #316] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: strb w8, [sp, #319] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strb w9, [sp, #314] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: strb w8, [sp, #317] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strb w9, [sp, #312] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: strb w8, [sp, #315] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strb w9, [sp, #310] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: strb w8, [sp, #313] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strb w9, [sp, #308] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: strb w8, [sp, #311] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strb w9, [sp, #306] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #20] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #309] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strb w5, [sp, #303] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: strb w6, [sp, #302] +; NONEON-NOSVE-NEXT: add w6, w29, w29 +; NONEON-NOSVE-NEXT: strb w8, [sp, #307] +; NONEON-NOSVE-NEXT: add w8, w1, w1 +; NONEON-NOSVE-NEXT: strb w5, [sp, #301] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w6, [sp, #300] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #384] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w8, [sp, #305] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #24] // 8-byte Folded Reload +; NONEON-NOSVE-NEXT: strb w9, [sp, #304] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #288] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #352] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #336] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q1, q0, [x8] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #320] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #416 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i8> @@ -850,8 +2743,15 @@ define <4 x i16> @trunc_v4i64_v4i16(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i16: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: xtn v0.4h, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp] +; NONEON-NOSVE-NEXT: strh w9, [sp, #46] +; NONEON-NOSVE-NEXT: strh w10, [sp, #42] +; NONEON-NOSVE-NEXT: strh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i16> @@ -879,11 +2779,27 @@ define <8 x i16> @trunc_v8i64_v8i16(ptr %in) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: ldp q2, q3, [x0] +; NONEON-NOSVE-NEXT: str q1, [sp, #48] +; NONEON-NOSVE-NEXT: stp q0, q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp, #48] +; NONEON-NOSVE-NEXT: strh w9, [sp, #78] +; NONEON-NOSVE-NEXT: strh w8, [sp, #72] +; NONEON-NOSVE-NEXT: ldp x8, x11, [sp, #32] +; NONEON-NOSVE-NEXT: strh w10, [sp, #74] +; NONEON-NOSVE-NEXT: strh w8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #8] +; NONEON-NOSVE-NEXT: strh w11, [sp, #70] +; NONEON-NOSVE-NEXT: strh w8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr %in %b = trunc <8 x i64> %a to <8 x i16> @@ -925,19 +2841,66 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: sub sp, sp, #160 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] ; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v3.8h, v1.8h -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #64] +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w2, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #104] +; NONEON-NOSVE-NEXT: stp q5, q7, [sp] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldr w4, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w5, [sp, #88] +; NONEON-NOSVE-NEXT: stp q6, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w18, [sp] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #8] +; NONEON-NOSVE-NEXT: strh w9, [sp, #142] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: strh w8, [sp, #140] +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldr w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w17, [sp, #40] +; NONEON-NOSVE-NEXT: strh w9, [sp, #138] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: strh w8, [sp, #136] +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #24] +; NONEON-NOSVE-NEXT: strh w9, [sp, #134] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: strh w8, [sp, #132] +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #56] +; NONEON-NOSVE-NEXT: strh w9, [sp, #130] +; NONEON-NOSVE-NEXT: add w9, w17, w17 +; NONEON-NOSVE-NEXT: strh w8, [sp, #128] +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #120] +; NONEON-NOSVE-NEXT: strh w9, [sp, #158] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: strh w8, [sp, #156] +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: strh w9, [sp, #154] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: strh w8, [sp, #152] +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: strh w9, [sp, #150] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: strh w8, [sp, #148] +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: strh w9, [sp, #146] +; NONEON-NOSVE-NEXT: strh w8, [sp, #144] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #128] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #160 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i16> @@ -1006,32 +2969,140 @@ define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #128] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #160] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #224] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: ldp q3, q1, [x0] -; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v6.4s, v7.4s, v6.4s -; NONEON-NOSVE-NEXT: ldp q16, q7, [x0, #32] -; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v5.4s, v17.4s, v5.4s -; NONEON-NOSVE-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; NONEON-NOSVE-NEXT: uzp1 v7.4s, v16.4s, v7.4s -; NONEON-NOSVE-NEXT: uzp1 v3.4s, v19.4s, v18.4s -; NONEON-NOSVE-NEXT: uzp1 v2.8h, v4.8h, v6.8h -; NONEON-NOSVE-NEXT: add v0.8h, v0.8h, v0.8h -; NONEON-NOSVE-NEXT: uzp1 v1.8h, v1.8h, v7.8h -; NONEON-NOSVE-NEXT: uzp1 v3.8h, v5.8h, v3.8h -; NONEON-NOSVE-NEXT: add v2.8h, v2.8h, v2.8h -; NONEON-NOSVE-NEXT: add v1.8h, v1.8h, v1.8h -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: add v3.8h, v3.8h, v3.8h -; NONEON-NOSVE-NEXT: stp q1, q3, [x1] +; NONEON-NOSVE-NEXT: sub sp, sp, #432 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #96] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #352] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #64] +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #368] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #384] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #128] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #32] +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #416] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #336] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #224] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #192] +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #160] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #168] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #176] +; NONEON-NOSVE-NEXT: stp q21, q19, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w25, [sp, #192] +; NONEON-NOSVE-NEXT: ldr w26, [sp, #200] +; NONEON-NOSVE-NEXT: add w6, w8, w8 +; NONEON-NOSVE-NEXT: add w5, w9, w9 +; NONEON-NOSVE-NEXT: ldr w9, [sp, #176] +; NONEON-NOSVE-NEXT: stp q20, q23, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #56] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldr w18, [sp, #80] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #112] +; NONEON-NOSVE-NEXT: stp q6, q5, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #88] +; NONEON-NOSVE-NEXT: ldr w16, [sp, #32] +; NONEON-NOSVE-NEXT: stp q3, q17, [sp, #240] +; NONEON-NOSVE-NEXT: ldr w23, [sp, #224] +; NONEON-NOSVE-NEXT: ldr w24, [sp, #232] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #256] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #264] +; NONEON-NOSVE-NEXT: ldr w27, [sp, #240] +; NONEON-NOSVE-NEXT: ldr w28, [sp, #248] +; NONEON-NOSVE-NEXT: strh w9, [sp, #308] +; NONEON-NOSVE-NEXT: ldr w21, [sp, #208] +; NONEON-NOSVE-NEXT: add w9, w27, w27 +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w8, [sp, #184] +; NONEON-NOSVE-NEXT: str q7, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #216] +; NONEON-NOSVE-NEXT: ldr w17, [sp, #40] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w9, [sp, #304] +; NONEON-NOSVE-NEXT: add w9, w25, w25 +; NONEON-NOSVE-NEXT: strh w8, [sp, #310] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: ldr w19, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #306] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: ldr w20, [sp, #24] +; NONEON-NOSVE-NEXT: str q18, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #120] +; NONEON-NOSVE-NEXT: strh w8, [sp, #302] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: ldr w4, [sp, #64] +; NONEON-NOSVE-NEXT: strh w9, [sp, #300] +; NONEON-NOSVE-NEXT: add w9, w23, w23 +; NONEON-NOSVE-NEXT: ldr w7, [sp, #72] +; NONEON-NOSVE-NEXT: strh w8, [sp, #298] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #96] +; NONEON-NOSVE-NEXT: strh w9, [sp, #296] +; NONEON-NOSVE-NEXT: add w9, w21, w21 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #104] +; NONEON-NOSVE-NEXT: strh w8, [sp, #294] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #128] +; NONEON-NOSVE-NEXT: strh w9, [sp, #292] +; NONEON-NOSVE-NEXT: add w9, w19, w19 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #136] +; NONEON-NOSVE-NEXT: strh w8, [sp, #290] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: ldr w29, [sp, #144] +; NONEON-NOSVE-NEXT: strh w9, [sp, #288] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldr w30, [sp, #152] +; NONEON-NOSVE-NEXT: strh w8, [sp, #286] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: strh w9, [sp, #284] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: strh w8, [sp, #282] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: strh w9, [sp, #280] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: strh w8, [sp, #278] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: strh w9, [sp, #276] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: strh w8, [sp, #274] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: strh w9, [sp, #272] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: strh w8, [sp, #334] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: strh w9, [sp, #332] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: strh w8, [sp, #330] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: strh w9, [sp, #328] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: strh w8, [sp, #326] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w9, [sp, #324] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: strh w5, [sp, #318] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: strh w6, [sp, #316] +; NONEON-NOSVE-NEXT: add w6, w29, w29 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: strh w5, [sp, #314] +; NONEON-NOSVE-NEXT: ldp q1, q3, [sp, #272] +; NONEON-NOSVE-NEXT: strh w6, [sp, #312] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #416] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w8, [sp, #322] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: strh w9, [sp, #320] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #384] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q2, q0, [sp, #304] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #368] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #352] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q3, q2, [x1] +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #336] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q0, q1, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #432 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i16> @@ -1058,7 +3129,13 @@ define <4 x i32> @trunc_v4i64_v4i32(ptr %in) nounwind { ; NONEON-NOSVE-LABEL: trunc_v4i64_v4i32: ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-48]! +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #16] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldp x8, x10, [sp] +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %a = load <4 x i64>, ptr %in %b = trunc <4 x i64> %a to <4 x i32> @@ -1085,13 +3162,34 @@ define void @trunc_v8i64_v8i32(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v8i64_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: sub sp, sp, #96 +; NONEON-NOSVE-NEXT: ldp q0, q1, [x0, #32] ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s +; NONEON-NOSVE-NEXT: stp q3, q1, [sp] +; NONEON-NOSVE-NEXT: stp q2, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w12, [sp] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #8] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #40] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #24] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #56] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #72] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #64] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #88] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #80] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: add sp, sp, #96 ; NONEON-NOSVE-NEXT: ret %a = load <8 x i64>, ptr %in %b = trunc <8 x i64> %a to <8 x i32> @@ -1131,20 +3229,60 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v16i64_v16i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] -; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #96] +; NONEON-NOSVE-NEXT: sub sp, sp, #192 +; NONEON-NOSVE-NEXT: ldp q3, q2, [x0, #32] ; NONEON-NOSVE-NEXT: ldp q5, q4, [x0] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: ldp q6, q1, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: uzp1 v3.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v6.4s, v1.4s -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v2.4s, v2.4s, v2.4s -; NONEON-NOSVE-NEXT: add v3.4s, v3.4s, v3.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q2, [x1, #32] -; NONEON-NOSVE-NEXT: stp q3, q1, [x1] +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #96] +; NONEON-NOSVE-NEXT: stp q2, q4, [sp, #64] +; NONEON-NOSVE-NEXT: stp q3, q1, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #72] +; NONEON-NOSVE-NEXT: ldr w2, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w3, [sp, #104] +; NONEON-NOSVE-NEXT: stp q5, q7, [sp] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: ldr w4, [sp, #80] +; NONEON-NOSVE-NEXT: ldr w5, [sp, #88] +; NONEON-NOSVE-NEXT: stp q6, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w18, [sp] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #8] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #168] +; NONEON-NOSVE-NEXT: add w9, w3, w3 +; NONEON-NOSVE-NEXT: add w8, w2, w2 +; NONEON-NOSVE-NEXT: ldr w16, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w17, [sp, #40] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #160] +; NONEON-NOSVE-NEXT: add w9, w5, w5 +; NONEON-NOSVE-NEXT: add w8, w4, w4 +; NONEON-NOSVE-NEXT: ldr w14, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #24] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #152] +; NONEON-NOSVE-NEXT: add w9, w0, w0 +; NONEON-NOSVE-NEXT: add w8, w18, w18 +; NONEON-NOSVE-NEXT: ldr w12, [sp, #48] +; NONEON-NOSVE-NEXT: ldr w13, [sp, #56] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #144] +; NONEON-NOSVE-NEXT: add w9, w17, w17 +; NONEON-NOSVE-NEXT: add w8, w16, w16 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w11, [sp, #120] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #136] +; NONEON-NOSVE-NEXT: add w9, w15, w15 +; NONEON-NOSVE-NEXT: add w8, w14, w14 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #128] +; NONEON-NOSVE-NEXT: add w9, w13, w13 +; NONEON-NOSVE-NEXT: add w8, w12, w12 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #184] +; NONEON-NOSVE-NEXT: add w9, w11, w11 +; NONEON-NOSVE-NEXT: add w8, w10, w10 +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #176] +; NONEON-NOSVE-NEXT: ldp q1, q3, [sp, #128] +; NONEON-NOSVE-NEXT: ldp q2, q0, [sp, #160] +; NONEON-NOSVE-NEXT: stp q3, q2, [x1] +; NONEON-NOSVE-NEXT: stp q0, q1, [x1, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #192 ; NONEON-NOSVE-NEXT: ret %a = load <16 x i64>, ptr %in %b = trunc <16 x i64> %a to <16 x i32> @@ -1206,34 +3344,145 @@ define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind { ; ; NONEON-NOSVE-LABEL: trunc_v32i64_v32i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #192] -; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #224] -; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #128] -; NONEON-NOSVE-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; NONEON-NOSVE-NEXT: ldp q16, q1, [x0, #160] -; NONEON-NOSVE-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; NONEON-NOSVE-NEXT: ldp q17, q5, [x0, #64] -; NONEON-NOSVE-NEXT: uzp1 v6.4s, v7.4s, v6.4s +; NONEON-NOSVE-NEXT: sub sp, sp, #496 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0, #32] +; NONEON-NOSVE-NEXT: stp x28, x27, [sp, #416] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #192] +; NONEON-NOSVE-NEXT: stp x26, x25, [sp, #432] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: stp x24, x23, [sp, #448] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q23, q22, [x0, #224] +; NONEON-NOSVE-NEXT: stp x22, x21, [sp, #464] // 16-byte Folded Spill ; NONEON-NOSVE-NEXT: ldp q3, q2, [x0] -; NONEON-NOSVE-NEXT: ldp q18, q7, [x0, #96] -; NONEON-NOSVE-NEXT: uzp1 v1.4s, v16.4s, v1.4s -; NONEON-NOSVE-NEXT: uzp1 v5.4s, v17.4s, v5.4s -; NONEON-NOSVE-NEXT: ldp q17, q16, [x0, #32] -; NONEON-NOSVE-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; NONEON-NOSVE-NEXT: add v0.4s, v0.4s, v0.4s -; NONEON-NOSVE-NEXT: add v4.4s, v4.4s, v4.4s -; NONEON-NOSVE-NEXT: uzp1 v7.4s, v18.4s, v7.4s -; NONEON-NOSVE-NEXT: add v3.4s, v6.4s, v6.4s -; NONEON-NOSVE-NEXT: uzp1 v6.4s, v17.4s, v16.4s -; NONEON-NOSVE-NEXT: add v1.4s, v1.4s, v1.4s -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #96] -; NONEON-NOSVE-NEXT: add v0.4s, v5.4s, v5.4s -; NONEON-NOSVE-NEXT: add v2.4s, v2.4s, v2.4s -; NONEON-NOSVE-NEXT: add v4.4s, v7.4s, v7.4s -; NONEON-NOSVE-NEXT: stp q3, q1, [x1, #64] -; NONEON-NOSVE-NEXT: add v1.4s, v6.4s, v6.4s -; NONEON-NOSVE-NEXT: stp q0, q4, [x1, #32] -; NONEON-NOSVE-NEXT: stp q2, q1, [x1] +; NONEON-NOSVE-NEXT: stp x20, x19, [sp, #480] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q5, q4, [x0, #96] +; NONEON-NOSVE-NEXT: stp x29, x30, [sp, #400] // 16-byte Folded Spill +; NONEON-NOSVE-NEXT: ldp q7, q6, [x0, #64] +; NONEON-NOSVE-NEXT: ldp q19, q18, [x0, #160] +; NONEON-NOSVE-NEXT: ldp q21, q20, [x0, #128] +; NONEON-NOSVE-NEXT: str q0, [sp, #192] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #192] +; NONEON-NOSVE-NEXT: stp q17, q23, [sp, #32] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #200] +; NONEON-NOSVE-NEXT: ldr w10, [sp, #32] +; NONEON-NOSVE-NEXT: stp q4, q6, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w12, [sp, #48] +; NONEON-NOSVE-NEXT: add w6, w8, w8 +; NONEON-NOSVE-NEXT: add w5, w9, w9 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #40] +; NONEON-NOSVE-NEXT: stp q18, q20, [sp, #112] +; NONEON-NOSVE-NEXT: ldr w25, [sp, #160] +; NONEON-NOSVE-NEXT: ldr w26, [sp, #168] +; NONEON-NOSVE-NEXT: str q5, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w21, [sp, #176] +; NONEON-NOSVE-NEXT: ldr w22, [sp, #184] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w23, [sp, #144] +; NONEON-NOSVE-NEXT: ldr w24, [sp, #152] +; NONEON-NOSVE-NEXT: str q3, [sp, #16] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #208] +; NONEON-NOSVE-NEXT: ldr w4, [sp, #112] +; NONEON-NOSVE-NEXT: stp w8, w10, [sp, #8] // 8-byte Folded Spill +; NONEON-NOSVE-NEXT: ldr w8, [sp, #216] +; NONEON-NOSVE-NEXT: ldr w27, [sp, #16] +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: ldr w28, [sp, #24] +; NONEON-NOSVE-NEXT: stp q22, q16, [sp, #64] +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w9, [sp, #344] +; NONEON-NOSVE-NEXT: add w9, w27, w27 +; NONEON-NOSVE-NEXT: str w8, [sp, #348] +; NONEON-NOSVE-NEXT: add w8, w28, w28 +; NONEON-NOSVE-NEXT: ldr w7, [sp, #120] +; NONEON-NOSVE-NEXT: stp q7, q21, [sp, #240] +; NONEON-NOSVE-NEXT: ldr w18, [sp, #128] +; NONEON-NOSVE-NEXT: ldr w0, [sp, #136] +; NONEON-NOSVE-NEXT: str w8, [sp, #340] +; NONEON-NOSVE-NEXT: add w8, w26, w26 +; NONEON-NOSVE-NEXT: ldr w19, [sp, #240] +; NONEON-NOSVE-NEXT: str w9, [sp, #336] +; NONEON-NOSVE-NEXT: add w9, w25, w25 +; NONEON-NOSVE-NEXT: ldr w20, [sp, #248] +; NONEON-NOSVE-NEXT: str w8, [sp, #332] +; NONEON-NOSVE-NEXT: add w8, w24, w24 +; NONEON-NOSVE-NEXT: ldr w16, [sp, #256] +; NONEON-NOSVE-NEXT: str w9, [sp, #328] +; NONEON-NOSVE-NEXT: add w9, w23, w23 +; NONEON-NOSVE-NEXT: ldr w17, [sp, #264] +; NONEON-NOSVE-NEXT: str q19, [sp, #96] +; NONEON-NOSVE-NEXT: ldr w14, [sp, #64] +; NONEON-NOSVE-NEXT: ldr w15, [sp, #72] +; NONEON-NOSVE-NEXT: str w8, [sp, #324] +; NONEON-NOSVE-NEXT: add w8, w22, w22 +; NONEON-NOSVE-NEXT: ldr w2, [sp, #96] +; NONEON-NOSVE-NEXT: str w9, [sp, #320] +; NONEON-NOSVE-NEXT: add w9, w21, w21 +; NONEON-NOSVE-NEXT: ldr w3, [sp, #104] +; NONEON-NOSVE-NEXT: str w8, [sp, #380] +; NONEON-NOSVE-NEXT: add w8, w20, w20 +; NONEON-NOSVE-NEXT: ldr w13, [sp, #56] +; NONEON-NOSVE-NEXT: str w9, [sp, #376] +; NONEON-NOSVE-NEXT: add w9, w19, w19 +; NONEON-NOSVE-NEXT: ldr w10, [sp, #80] +; NONEON-NOSVE-NEXT: str w8, [sp, #372] +; NONEON-NOSVE-NEXT: add w8, w7, w7 +; NONEON-NOSVE-NEXT: ldr w11, [sp, #88] +; NONEON-NOSVE-NEXT: str w9, [sp, #368] +; NONEON-NOSVE-NEXT: add w9, w4, w4 +; NONEON-NOSVE-NEXT: ldr w29, [sp, #224] +; NONEON-NOSVE-NEXT: str w8, [sp, #316] +; NONEON-NOSVE-NEXT: add w8, w3, w3 +; NONEON-NOSVE-NEXT: ldr w30, [sp, #232] +; NONEON-NOSVE-NEXT: str w9, [sp, #312] +; NONEON-NOSVE-NEXT: add w9, w2, w2 +; NONEON-NOSVE-NEXT: str w8, [sp, #308] +; NONEON-NOSVE-NEXT: add w8, w0, w0 +; NONEON-NOSVE-NEXT: str w9, [sp, #304] +; NONEON-NOSVE-NEXT: add w9, w18, w18 +; NONEON-NOSVE-NEXT: str w8, [sp, #396] +; NONEON-NOSVE-NEXT: add w8, w17, w17 +; NONEON-NOSVE-NEXT: str w9, [sp, #392] +; NONEON-NOSVE-NEXT: add w9, w16, w16 +; NONEON-NOSVE-NEXT: str w8, [sp, #388] +; NONEON-NOSVE-NEXT: add w8, w15, w15 +; NONEON-NOSVE-NEXT: str w9, [sp, #384] +; NONEON-NOSVE-NEXT: add w9, w14, w14 +; NONEON-NOSVE-NEXT: str w8, [sp, #284] +; NONEON-NOSVE-NEXT: add w8, w13, w13 +; NONEON-NOSVE-NEXT: str w9, [sp, #280] +; NONEON-NOSVE-NEXT: add w9, w12, w12 +; NONEON-NOSVE-NEXT: str w8, [sp, #276] +; NONEON-NOSVE-NEXT: add w8, w11, w11 +; NONEON-NOSVE-NEXT: str w9, [sp, #272] +; NONEON-NOSVE-NEXT: add w9, w10, w10 +; NONEON-NOSVE-NEXT: str w8, [sp, #300] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: str w9, [sp, #296] +; NONEON-NOSVE-NEXT: ldr w9, [sp, #12] // 4-byte Folded Reload +; NONEON-NOSVE-NEXT: add w8, w8, w8 +; NONEON-NOSVE-NEXT: str w5, [sp, #364] +; NONEON-NOSVE-NEXT: add w5, w30, w30 +; NONEON-NOSVE-NEXT: add w9, w9, w9 +; NONEON-NOSVE-NEXT: str w6, [sp, #360] +; NONEON-NOSVE-NEXT: add w6, w29, w29 +; NONEON-NOSVE-NEXT: str w5, [sp, #356] +; NONEON-NOSVE-NEXT: ldp q6, q3, [sp, #304] +; NONEON-NOSVE-NEXT: str w6, [sp, #352] +; NONEON-NOSVE-NEXT: ldp q4, q7, [sp, #368] +; NONEON-NOSVE-NEXT: str w8, [sp, #292] +; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #336] +; NONEON-NOSVE-NEXT: str w9, [sp, #288] +; NONEON-NOSVE-NEXT: ldp x20, x19, [sp, #480] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp q5, q2, [sp, #272] +; NONEON-NOSVE-NEXT: stp q4, q3, [x1, #32] +; NONEON-NOSVE-NEXT: stp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldp x22, x21, [sp, #464] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q7, q6, [x1, #64] +; NONEON-NOSVE-NEXT: ldp x24, x23, [sp, #448] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: stp q2, q5, [x1, #96] +; NONEON-NOSVE-NEXT: ldp x26, x25, [sp, #432] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x28, x27, [sp, #416] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: ldp x29, x30, [sp, #400] // 16-byte Folded Reload +; NONEON-NOSVE-NEXT: add sp, sp, #496 ; NONEON-NOSVE-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll index 874af15e211177..323f5f56a2c085 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll @@ -18,8 +18,17 @@ define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v1.8b, v0.8b, v0.8b, #6 -; NONEON-NOSVE-NEXT: trn1 v0.4h, v0.4h, v1.4h +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: strh w8, [sp, #10] +; NONEON-NOSVE-NEXT: ldrh w8, [sp] +; NONEON-NOSVE-NEXT: strh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ldur w8, [sp, #2] +; NONEON-NOSVE-NEXT: ror w8, w8, #16 +; NONEON-NOSVE-NEXT: str w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> ret <4 x i8> %ret @@ -38,7 +47,19 @@ define <8 x i8> @shuffle_ext_byone_v8i8(<8 x i8> %op1, <8 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #7 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #22] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: sturh w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stur w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x i8> %op1, <8 x i8> %op2, <8 x i32> ret <8 x i8> %ret @@ -57,7 +78,20 @@ define <16 x i8> @shuffle_ext_byone_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #15 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #30] +; NONEON-NOSVE-NEXT: strb w8, [sp, #47] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: sturh w8, [sp, #45] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: stur w8, [sp, #41] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: stur x8, [sp, #33] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #15] +; NONEON-NOSVE-NEXT: strb w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <16 x i8> %op1, <16 x i8> %op2, <16 x i32> @@ -80,11 +114,35 @@ define void @shuffle_ext_byone_v32i8(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v32i8: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #15 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #15 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #14] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: strb w8, [sp, #31] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: sturh w8, [sp, #29] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stur w8, [sp, #25] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: stur x8, [sp, #17] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #63] +; NONEON-NOSVE-NEXT: strb w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #62] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strb w8, [sp, #79] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: sturh w8, [sp, #77] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: stur w8, [sp, #73] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: stur x8, [sp, #65] +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #47] +; NONEON-NOSVE-NEXT: strb w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -107,7 +165,12 @@ define <2 x i16> @shuffle_ext_byone_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: rev64 v0.2s, v0.2s +; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 +; NONEON-NOSVE-NEXT: ldp w9, w8, [sp] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i16> %op1, <2 x i16> %op2, <2 x i32> ret <2 x i16> %ret @@ -126,7 +189,17 @@ define <4 x i16> @shuffle_ext_byone_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: stur w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i16> %op1, <4 x i16> %op2, <4 x i32> ret <4 x i16> %ret @@ -145,7 +218,18 @@ define <8 x i16> @shuffle_ext_byone_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: strh w8, [sp, #46] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: stur w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: stur x8, [sp, #34] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: strh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x i16> %op1, <8 x i16> %op2, <8 x i32> ret <8 x i16> %ret @@ -167,11 +251,31 @@ define void @shuffle_ext_byone_v16i16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16i16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #14 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: strh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stur w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: stur x8, [sp, #18] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] +; NONEON-NOSVE-NEXT: strh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: strh w8, [sp, #78] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: stur w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: stur x8, [sp, #66] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] +; NONEON-NOSVE-NEXT: strh w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -194,7 +298,13 @@ define <2 x i32> @shuffle_ext_byone_v2i32(<2 x i32> %op1, <2 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i32> %op1, <2 x i32> %op2, <2 x i32> ret <2 x i32> %ret @@ -213,7 +323,16 @@ define <4 x i32> @shuffle_ext_byone_v4i32(<4 x i32> %op1, <4 x i32> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: str w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: stur x8, [sp, #36] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] +; NONEON-NOSVE-NEXT: str w8, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x i32> %op1, <4 x i32> %op2, <4 x i32> ret <4 x i32> %ret @@ -235,11 +354,26 @@ define void @shuffle_ext_byone_v8i32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8i32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #12 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: str w8, [sp, #28] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: stur x8, [sp, #20] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] +; NONEON-NOSVE-NEXT: str w8, [sp, #76] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: str w9, [sp, #16] +; NONEON-NOSVE-NEXT: stur x8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #44] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: str w8, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -261,7 +395,12 @@ define <2 x i64> @shuffle_ext_byone_v2i64(<2 x i64> %op1, <2 x i64> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #8] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x i64> %op1, <2 x i64> %op2, <2 x i32> ret <2 x i64> %ret @@ -283,11 +422,20 @@ define void @shuffle_ext_byone_v4i64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4i64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #8 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x9, [sp] +; NONEON-NOSVE-NEXT: ldp x11, x10, [sp, #48] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #40] +; NONEON-NOSVE-NEXT: stp x10, x9, [sp, #16] +; NONEON-NOSVE-NEXT: stp x8, x11, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -309,7 +457,17 @@ define <4 x half> @shuffle_ext_byone_v4f16(<4 x half> %op1, <4 x half> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #16] +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: stur w8, [sp, #26] +; NONEON-NOSVE-NEXT: str h0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x half> %op1, <4 x half> %op2, <4 x i32> ret <4 x half> %ret @@ -327,7 +485,18 @@ define <8 x half> @shuffle_ext_byone_v8f16(<8 x half> %op1, <8 x half> %op2) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #24] +; NONEON-NOSVE-NEXT: str h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] +; NONEON-NOSVE-NEXT: stur w8, [sp, #42] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: str h0, [sp, #32] +; NONEON-NOSVE-NEXT: stur x8, [sp, #34] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <8 x half> %op1, <8 x half> %op2, <8 x i32> ret <8 x half> %ret @@ -347,11 +516,31 @@ define void @shuffle_ext_byone_v16f16(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v16f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #14 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #14 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: str h0, [sp, #30] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: stur w8, [sp, #26] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str h0, [sp, #16] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] +; NONEON-NOSVE-NEXT: stur x8, [sp, #18] +; NONEON-NOSVE-NEXT: ldr w8, [sp, #56] +; NONEON-NOSVE-NEXT: str h0, [sp, #78] +; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: stur w8, [sp, #74] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: str h0, [sp, #64] +; NONEON-NOSVE-NEXT: stur x8, [sp, #66] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -373,7 +562,13 @@ define <2 x float> @shuffle_ext_byone_v2f32(<2 x float> %op1, <2 x float> %op2) ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.8b, v0.8b, v1.8b, #4 +; NONEON-NOSVE-NEXT: sub sp, sp, #32 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #8] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #12] +; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #24] +; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x float> %op1, <2 x float> %op2, <2 x i32> ret <2 x float> %ret @@ -391,7 +586,16 @@ define <4 x float> @shuffle_ext_byone_v4f32(<4 x float> %op1, <4 x float> %op2) ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #16] +; NONEON-NOSVE-NEXT: str s0, [sp, #44] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] +; NONEON-NOSVE-NEXT: stur x8, [sp, #36] +; NONEON-NOSVE-NEXT: str s0, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <4 x float> %op1, <4 x float> %op2, <4 x i32> ret <4 x float> %ret @@ -411,11 +615,26 @@ define void @shuffle_ext_byone_v8f32(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v8f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #12 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #12 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr x8, [sp] +; NONEON-NOSVE-NEXT: str s0, [sp, #28] +; NONEON-NOSVE-NEXT: ldp s0, s1, [sp, #56] +; NONEON-NOSVE-NEXT: stur x8, [sp, #20] +; NONEON-NOSVE-NEXT: ldr x8, [sp, #48] +; NONEON-NOSVE-NEXT: str s0, [sp, #76] +; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] +; NONEON-NOSVE-NEXT: str s1, [sp, #16] +; NONEON-NOSVE-NEXT: stur x8, [sp, #68] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: str s0, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -436,7 +655,12 @@ define <2 x double> @shuffle_ext_byone_v2f64(<2 x double> %op1, <2 x double> %op ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v2f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 +; NONEON-NOSVE-NEXT: stp q0, q1, [sp, #-48]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 48 +; NONEON-NOSVE-NEXT: ldp x8, x9, [sp, #8] +; NONEON-NOSVE-NEXT: stp x8, x9, [sp, #32] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #32] +; NONEON-NOSVE-NEXT: add sp, sp, #48 ; NONEON-NOSVE-NEXT: ret %ret = shufflevector <2 x double> %op1, <2 x double> %op2, <2 x i32> ret <2 x double> %ret @@ -456,11 +680,20 @@ define void @shuffle_ext_byone_v4f64(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_v4f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q1, q2, [x1] -; NONEON-NOSVE-NEXT: ldr q0, [x0, #16] -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v2.16b, #8 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x1] +; NONEON-NOSVE-NEXT: ldr q2, [x0, #16] +; NONEON-NOSVE-NEXT: str q0, [sp, #-80]! +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: stp q2, q1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d1, [sp] +; NONEON-NOSVE-NEXT: ldp d3, d2, [sp, #48] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] +; NONEON-NOSVE-NEXT: stp d2, d1, [sp, #16] +; NONEON-NOSVE-NEXT: stp d0, d3, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #64] ; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -483,11 +716,21 @@ define void @shuffle_ext_byone_reverse(ptr %a, ptr %b) { ; ; NONEON-NOSVE-LABEL: shuffle_ext_byone_reverse: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldp q0, q2, [x0] -; NONEON-NOSVE-NEXT: ldr q1, [x1, #16] -; NONEON-NOSVE-NEXT: ext v1.16b, v1.16b, v0.16b, #8 -; NONEON-NOSVE-NEXT: ext v0.16b, v0.16b, v2.16b, #8 -; NONEON-NOSVE-NEXT: stp q1, q0, [x0] +; NONEON-NOSVE-NEXT: sub sp, sp, #80 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] +; NONEON-NOSVE-NEXT: ldr q2, [x1, #16] +; NONEON-NOSVE-NEXT: str q2, [sp] +; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #32] +; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #40] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] +; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldr q1, [sp, #64] +; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: stp q0, q1, [x0] +; NONEON-NOSVE-NEXT: add sp, sp, #80 ; NONEON-NOSVE-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll index e69f59aedc026f..67cdde718e391f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll @@ -43,7 +43,8 @@ define <2 x i64> @fixed_vec_zero_constant() { ; ; NONEON-NOSVE-LABEL: fixed_vec_zero_constant: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI2_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] ; NONEON-NOSVE-NEXT: ret ret <2 x i64> zeroinitializer } @@ -57,7 +58,8 @@ define <2 x double> @fixed_vec_fp_zero_constant() { ; ; NONEON-NOSVE-LABEL: fixed_vec_fp_zero_constant: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: movi v0.2d, #0000000000000000 +; NONEON-NOSVE-NEXT: adrp x8, .LCPI3_0 +; NONEON-NOSVE-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] ; NONEON-NOSVE-NEXT: ret ret <2 x double> }